[simplecast] Add new extractor

2024-11-22 16:44:32 +01:00 · 2020-02-20 14:33:05 -06:00 · 2020-02-20 14:33:05 -06:00 · 08c47b7af9
commit 08c47b7af9
parent 97c822b3d5
3 changed files with 338 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -991,6 +991,12 @@ from .shared import (
 from .showroomlive import ShowRoomLiveIE
 from .sina import SinaIE
 from .sixplay import SixPlayIE
+from .simplecast import (
+        SimplecastIE,
+        SimplecastEmbedIE,
+        SimplecastEpisodeIE,
+        SimplecastPodcastIE,
+)
 from .skylinewebcams import SkylineWebcamsIE
 from .skynewsarabia import (
    SkyNewsArabiaIE,
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -2659,6 +2659,11 @@ class GenericIE(InfoExtractor):
            return self.playlist_from_matches(
                matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')

+        # Look for simplecast embeds
+        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:embed|player)\.simplecast\.com/[^"]+)"', webpage)
+        if matches:
+            return self.playlist_from_matches(matches, ie='SimplecastEmbed')
+
        # Look for BBC iPlayer embed
        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
        if matches:
--- a/youtube_dl/extractor/simplecast.py
+++ b/youtube_dl/extractor/simplecast.py
@ -0,0 +1,327 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+from ..compat import (
+    compat_urllib_parse_unquote_to_bytes,
+)
+
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    OnDemandPagedList,
+    strip_or_none,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class SimplecastIE(InfoExtractor):
+    IE_NAME = 'simplecast'
+    _VALID_URL = r'https://api\.simplecast\.com/episodes/(?P<id>[^?/]+)'
+    _TEST = {
+        'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
+        'md5': '8c93be7be54251bf29ee97464eabe61c',
+        'info_dict': {
+            'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
+            'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+            'ext': 'mp3',
+            'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'episode_number': 1,
+            'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+            'description': 'md5:72c89d3ae63a77a6c55ce8becf170f2e',
+            'season_number': 1,
+            'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
+            'series': 'The RE:BIND.io Podcast',
+            'duration': 5343,
+            'timestamp': 1580979475,
+            'upload_date': '20200206',
+            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
+            'channel_url': r're:^https://.+\.simplecast.com$',
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        meta = self._download_json(
+            url, display_id,
+            expected_status=404)
+
+        if meta.get('status') == 404:
+            raise ExtractorError(
+                'The requested episode could not be found', expected=True)
+
+        summary = strip_or_none(meta.get('description'))
+        long_description = strip_or_none(
+            clean_html(meta.get('long_description')))
+        description = summary or long_description
+        if summary and long_description:
+            description = summary + '\n\n' + long_description
+
+        season_href = url_or_none(meta.get('season', {}).get('href'))
+        season_id = None
+        if season_href:
+            id_regex = re.match(
+                r'https?://api.simplecast.com/seasons/(?P<id>[^?]+)',
+                season_href)
+            if id_regex:
+                season_id = id_regex.group('id')
+
+        webpage_url = url_or_none(meta.get('episode_url'))
+        channel_url = None
+        if webpage_url:
+            channel_regex = re.match(
+                r'(?P<wp>https?://.+\.simplecast\.com)',
+                webpage_url)
+            if channel_regex:
+                channel_url = channel_regex.group('wp')
+
+        return {
+            'id': meta['id'],
+            'display_id': meta.get('slug') or display_id,
+            'title': meta['title'],
+            'url': meta['audio_file_url'],
+            'webpage_url': webpage_url,
+            'channel_url': channel_url,
+            'series': strip_or_none(meta.get('podcast', {}).get('title')),
+            'season_number': int_or_none(meta.get('season', {}).get('number')),
+            'season_id': season_id,
+            'thumbnail': url_or_none(
+                meta.get('image_url')
+                or meta.get('podcast', {}).get('image_url')),
+            'episode_id': meta.get('id'),
+            'episode_number': int_or_none(meta.get('number')),
+            'description': description,
+            'timestamp': unified_timestamp(meta.get('published_at')),
+            'duration': int_or_none(meta.get('duration')),
+        }
+
+
+class SimplecastEmbedIE(InfoExtractor):
+    IE_NAME = 'simplecast:embed'
+    _VALID_URL = r'https?://(?:embed|player)\.simplecast\.com/(?P<id>[^?]+)'
+    _TESTS = [{
+        'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
+        'md5': '8c93be7be54251bf29ee97464eabe61c',
+        'info_dict': {
+            'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
+            'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+            'ext': 'mp3',
+            'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'episode_number': 1,
+            'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+            'description': 'md5:72c89d3ae63a77a6c55ce8becf170f2e',
+            'season_number': 1,
+            'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
+            'series': 'The RE:BIND.io Podcast',
+            'duration': 5343,
+            'timestamp': 1580979475,
+            'upload_date': '20200206',
+            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
+            'channel_url': r're:^https://.+\.simplecast.com$',
+        }
+    }, {
+        'url': 'https://embed.simplecast.com/0bab9525',
+        'md5': '580b1cc614c8ba33d929aaeeb38c274b',
+        'info_dict': {
+            'id': '565b3059-5227-4439-86e7-3eb1d43bf209',
+            'ext': 'mp3',
+            'title': 'Talib Kweli',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'episode_number': 22,
+            'episode_id': '565b3059-5227-4439-86e7-3eb1d43bf209',
+            'description': 'md5:d9d22ebcc84b6938efd3896fd18cc047',
+            'season_number': 1,
+            'season_id': 'b9df4072-7bd7-4704-b314-8dba46369de4',
+            'series': 'Armchair Expert with Dax Shepard',
+            'duration': 10435,
+            'timestamp': 1528707600,
+            'upload_date': '20180611',
+            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
+            'channel_url': r're:^https://.+\.simplecast.com$',
+        }
+    }]
+
+    def _real_extract(self, url):
+        if re.match(r'https?://embed\.', url):
+            disp_id = self._match_id(url)
+            return self.url_result(
+                self._request_webpage(
+                    'https://embed.simplecast.com/{0}'.format(disp_id), disp_id,
+                    'Resolving ID', 'Unable to resolve ID').geturl(),
+                'SimplecastEmbed')
+        episode_id = self._match_id(url)
+        return self.url_result(
+            'https://api.simplecast.com/episodes/{0}'.format(episode_id),
+            'Simplecast', episode_id)
+
+
+class SimplecastEpisodeIE(InfoExtractor):
+    IE_NAME = 'simplecast:episode'
+    _VALID_URL = r'https?://(?!(?:cdn|embed|player|api|www)\.).*\.simplecast\.com/episodes/(?P<id>[^?]+)'
+    _TEST = {
+        'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
+        'md5': '8c93be7be54251bf29ee97464eabe61c',
+        'info_dict': {
+            'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
+            'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+            'ext': 'mp3',
+            'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'episode_number': 1,
+            'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+            'description': 'md5:72c89d3ae63a77a6c55ce8becf170f2e',
+            'season_number': 1,
+            'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
+            'series': 'The RE:BIND.io Podcast',
+            'duration': 5343,
+            'timestamp': 1580979475,
+            'upload_date': '20200206',
+            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
+            'channel_url': r're:^https://.+\.simplecast.com$',
+        }
+    }
+
+    def _real_extract(self, url):
+        url = url.rstrip('/')
+        display_id = self._match_id(url)
+
+        search_result = self._download_json(
+            'https://api.simplecast.com/episodes/search', display_id,
+            'Looking up episode info', 'Unable to look up episode info',
+            data=compat_urllib_parse_unquote_to_bytes(
+                '{{"url":"{0}"}}'.format(url)),
+            headers={'Content-Type': 'application/json;charset=utf-8'},
+            expected_status=404)
+
+        if search_result.get('status') == 404:
+            raise ExtractorError(
+                'The requested episode could not be found', expected=True)
+
+        episode_id = search_result['id']
+        self.to_screen('{0}: Real ID is {1}'.format(display_id, episode_id))
+
+        return self.url_result(
+            'https://api.simplecast.com/episodes/{0}'.format(episode_id),
+            'Simplecast', episode_id, search_result.get('title'))
+
+
+class SimplecastPodcastIE(InfoExtractor):
+    IE_NAME = 'simplecast:podcast'
+    _VALID_URL = r'https?://(?!(?:cdn|embed|player|api|www))(?P<id>.+)\.simplecast\.com(?:/episodes/?)?'
+    _PAGE_SIZE = 25
+    _PAGE_TOTAL = '?'
+    _TEST = {
+        'url': 'https://the-re-bind-io-podcast.simplecast.com',
+        'playlist_mincount': 3,
+        'info_dict': {
+            'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
+            'description': 'md5:5b83928525a22effaee9dd5c2addc378',
+            'title': 'The RE:BIND.io Podcast',
+        }
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if SimplecastEpisodeIE.suitable(url)
+                else super(SimplecastPodcastIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+
+        search_result = self._download_json(
+            'https://api.simplecast.com/sites/search', podcast_id,
+            'Looking up podcast info', 'Unable to look up podcast info',
+            data=compat_urllib_parse_unquote_to_bytes(
+                '{{"url":"{0}"}}'.format(url)),
+            headers={'Content-Type': 'application/json;charset=utf-8'},
+            expected_status=404)
+
+        if search_result.get('status') == 404:
+            raise ExtractorError(
+                'The requested podcast could not be found', expected=True)
+
+        series_id = search_result['podcast']['id']
+
+        pod_meta = self._download_json(
+            'https://api.simplecast.com/podcasts/{0}'.format(series_id),
+            podcast_id, 'Downloading podcast metadata',
+            'Unable to download podcast metadata',
+            fatal=False)
+
+        series_description = series_thumbnail = license = None
+        if type(pod_meta) is dict:
+            series_description = strip_or_none(pod_meta.get('description'))
+            license = strip_or_none(pod_meta.get('copyright'))
+            series_thumbnail = pod_meta.get('image_url')
+
+        season_list = self._download_json(
+            'https://api.simplecast.com/podcasts/{0}/seasons'.format(series_id),
+            podcast_id, 'Downloading season list',
+            'Unable to download season list')
+
+        if len(season_list['collection']) > 1:
+            # Haven't seen anything with multiple seasons,
+            # not sure how to handle that
+            raise ExtractorError(
+                'Support for podcasts with more than one season'
+                ' has not been implemented')
+        for season in season_list['collection']:
+            season_number = int_or_none(season.get('number'))
+            season_id = re.match(
+                r'https?://api.simplecast.com/seasons/(?P<id>[^?]+)',
+                season['href']).group('id')
+
+        series = strip_or_none(search_result.get('podcast', {}).get('title'))
+        subdomain = search_result.get('subdomain')
+        channel_url = None
+        if subdomain:
+            channel_url = 'https://{0}.simplecast.com'.format(subdomain)
+
+        def get_page(page_num):
+            episode_list = self._download_json(
+                'https://api.simplecast.com/seasons/{0}/episodes?limit={1}&offset={2}'.format(
+                    season_id, self._PAGE_SIZE, self._PAGE_SIZE * page_num),
+                podcast_id,
+                'Downloading episode list page {0} of {1}'.format(
+                    page_num + 1, self._PAGE_TOTAL),
+                'Unable to download episode list page {0} of {1}'.format(
+                    page_num + 1, self._PAGE_TOTAL))
+            self._PAGE_TOTAL = (
+                int_or_none(episode_list.get('pages', {}).get('total'))
+                or '?')
+
+            for episode in episode_list['collection']:
+                yield {
+                    '_type': 'url',
+                    'ie_key': 'Simplecast',
+                    'url': 'https://api.simplecast.com/episodes/{0}'.format(episode['id']),
+                    'id': episode['id'],
+                    'display_id': episode.get('slug'),
+                    'title': episode.get('title'),
+                    'webpage_url': url_or_none(search_result.get('episode_url')),
+                    'channel_id': search_result.get('id'),
+                    'channel_url': channel_url,
+                    'series': series,
+                    'season_id': season_id,
+                    'season_number': season_number,
+                    'thumbnail': url_or_none(episode.get('image_url') or series_thumbnail),
+                    'episode_id': episode['id'],
+                    'episode_number': int_or_none(episode.get('number')),
+                    'description': strip_or_none(episode.get('description')),
+                    'timestamp': unified_timestamp(episode.get('published_at')),
+                    'license': license,
+                }
+
+        return self.playlist_result(
+            OnDemandPagedList(get_page, self._PAGE_SIZE),
+            series_id, series, series_description)