diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4b3092028..41dcc5d54 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -456,6 +456,7 @@ from .ign import ( OneUPIE, PCMagIE, ) +from .iheartradio import IHeartRadioPodcastIE from .imdb import ( ImdbIE, ImdbListIE diff --git a/youtube_dl/extractor/iheartradio.py b/youtube_dl/extractor/iheartradio.py new file mode 100644 index 000000000..2b49f9b13 --- /dev/null +++ b/youtube_dl/extractor/iheartradio.py @@ -0,0 +1,195 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re +import uuid + +from .common import InfoExtractor + +from ..compat import compat_str + +from numbers import Number + +from ..utils import ( + clean_html, + str_or_none, + urlencode_postdata +) + + +class IHeartRadioPodcastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?iheart\.com/podcast/\d+-(?P[\w-]+)-(?P\d+)(?:/episode/(?P[\w-]+)-(?P<id>\d+))?/?(?:embed=true)?' + IE_NAME = "iheartradio:podcast" + _TESTS = [{ + 'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/', + 'info_dict': { + 'title': 'It Could Happen Here', + 'description': 'md5:5842117412a967eb0b01f8088eb663e2', + 'id': '30717896', + 'display_id': 'it-could-happen-here' + }, + 'playlist_count': 11 + }, { + 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', + # IHeartRadio has ads, I can't tell if they're custom or not, + # so the MD5 hash might be inconsistent + 'md5': 'c8609c92c8688dcb69d8541042b8abca', + 'info_dict': { + 'id': '70346499', + 'ext': 'mp3', + 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', + 'thumbnail': 'https://i.iheart.com/v3/catalog/podcast/29236323', + 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', + 'timestamp': 1597741200, + 'upload_date': '20200818', + 'duration': 4156, + 'display_id': 'part-one-alexander-lukashenko-the-dictator' + } + }] + + # To get the audio files, we have to use their internal API + def _real_extract(self, url): + match = re.match(self._VALID_URL, url) + + podcast_display_id = match.group('pod_title') + podcast_id = match.group('pod_id') + + episode_display_id = match.group('title') + episode_id = match.group('id') + + current_id = str_or_none(episode_id, default=podcast_id) + + is_single_episode = episode_id is not None + + # Don't load embed pages + url = url.replace('?embed=true', '') + webpage = self._download_webpage(url, current_id) + + # Register anonymous user, same behavior as web app + random_device_id = compat_str(uuid.uuid4()) + random_oauth_id = compat_str(uuid.uuid4()) + + register_user_values = urlencode_postdata({ + 'accessToken': 'anon', + 'accessTokenType': 'anon', + 'deviceId': random_device_id, + 'deviceName': "web-desktop", + 'host': "webapp.WW", + 'oauthUuid': random_oauth_id, + 'userName': 'anon' + random_oauth_id + }) + temp_user = self._download_json( + 'https://ww.api.iheart.com/api/v1/account/loginOrCreateOauthUser', + current_id, "Registering temporary user", data=register_user_values, + headers={'Accept': 'application/json, text/plain, */*', + 'X-hostName': 'webapp.WW'}) + + session_id = temp_user['sessionId'] + profile_id = temp_user['profileId'] + + if (not is_single_episode): + episode_ids = [] + for episode in self._get_all_episodes(podcast_id, temp_user): + episode_ids.append(episode['id']) + else: + episode_ids = [episode_id] + + streams_values = json.dumps({ + 'contentIds': episode_ids, + 'hostName': 'webapp.WW', + 'playedFrom': 6, # Not sure about the meaning behind this, Firefox uses 6 + 'stationId': compat_str(podcast_id), + 'stationType': 'PODCAST' + }).encode('utf-8') + stream_info = self._download_json( + 'https://ww.api.iheart.com/api/v2/playback/streams', + current_id, "Requesting stream info", data=streams_values, + headers={'Content-Type': 'application/json;charset=utf-8', + 'X-Session-Id': session_id, + 'X-User-Id': profile_id}) + + # self.to_screen(stream_info) + + # Extract info from webpage (entirely optional) + + podcast_title = self._search_regex( + r'<h1[^>]*>(?P<title>[^<]*)<\/h1>', + webpage, 'title', fatal=False) or self._og_search_title(webpage) + + podcast_description = self._search_regex( + r'<div class="[^"]*Body2-Description[^"]*">(?:<\/?span[^>]*>)*(?P<description>[^<]*)', + webpage, 'description', fatal=False) + + if (is_single_episode): + return self._real_extract_single(stream_info['items'][0], + episode_display_id, podcast_id, + podcast_title) + else: + entries = [] + for item in stream_info['items']: + entries.append(self._real_extract_single(item, + episode_display_id, + podcast_id, + podcast_title)) + return { + 'title': podcast_title, + 'description': podcast_description, + 'id': podcast_id, + 'display_id': podcast_display_id, + '_type': 'playlist', + 'entries': entries + } + + def _real_extract_single(self, item_info, display_id, podcast_id, podcast_title): + content_info = item_info['content'] + + thumbnails = [{ + 'url': 'https://i.iheart.com/v3/catalog/podcast/%s' % podcast_id, + 'width': 3000, + 'height': 3000 + }] + + # They have an API that dynamically generates images of a needed size + # This is a list of "standard" sizes which are used by the web app + for size in [75, 240, 480]: + thumbnails.append({ + 'url': 'https://i.iheart.com/v3/catalog/podcast/%(id)s?ops=fit(%(size)d, %(size)d)' % { + 'id': podcast_id, 'size': size}, + 'width': size, + 'height': size + }) + + # Release date timestamp is in milliseconds + release_date = content_info.get('startDate') + if (isinstance(release_date, Number) and release_date > 2000000000): + release_date /= 1000 + + # Remove analytics from stream URL (optional) + streamUrl = item_info['streamUrl'] + streamUrl = re.sub(r'(?:www\.)?podtrac\.com/pts/redirect\.[\w]*/', '', streamUrl) + streamUrl = re.sub(r'chtbl\.com/track/[\w]*/', '', streamUrl) + streamUrl = re.sub(r'\?source=[\w]*', '', streamUrl) + + return { + 'id': compat_str(content_info['id']), + 'display_id': display_id, + 'title': content_info['title'], + 'description': clean_html(content_info.get('description')), + 'url': streamUrl, + 'duration': content_info.get('duration'), + 'timestamp': release_date, + 'thumbnails': thumbnails, + 'series': podcast_title + } + + def _get_all_episodes(self, podcast_id, temp_user): + episodes_info = self._download_json( + 'https://ww.api.iheart.com/api/v3/podcast/podcasts/%s/episodes?limit=100000' % ( + podcast_id), + podcast_id, "Requesting episodes info", + headers={'Accept': 'application/json, text/plain, */*', + 'X-hostName': 'webapp.WW', + 'X-Ihr-Profile-Id': temp_user['profileId']}) + + return episodes_info['data']