[iheartradio:podcast] Add new extractor

2024-11-23 00:54:31 +01:00 · 2020-08-19 22:07:29 +03:00 · 2020-08-19 22:07:29 +03:00 · 186a3bbdbf
commit 186a3bbdbf
parent 10709fc7c6
2 changed files with 196 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -456,6 +456,7 @@ from .ign import (
    OneUPIE,
    PCMagIE,
 )
 from .iheartradio import IHeartRadioPodcastIE
 from .imdb import (
    ImdbIE,
    ImdbListIE
--- a/youtube_dl/extractor/iheartradio.py
+++ b/youtube_dl/extractor/iheartradio.py
@@ -0,0 +1,195 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import json
 import re
 import uuid
 from .common import InfoExtractor
 from ..compat import compat_str
 from numbers import Number
 from ..utils import (
    clean_html,
    str_or_none,
    urlencode_postdata
 )
 class IHeartRadioPodcastIE(InfoExtractor):
    """Extractor for iHeartRadio podcasts and single podcast episodes.

    A podcast URL produces a playlist with one entry per episode; a URL
    that also contains ``/episode/<slug>-<id>`` produces that episode only.
    Stream URLs come from the site's internal API, which is queried with a
    freshly registered anonymous user.
    """

    # The optional trailing group documents the ``?embed=true`` flag.
    # _real_extract uses re.match, which anchors only at the start, so URLs
    # with any other suffix are still accepted.
    _VALID_URL = r'https?://(?:www\.)?iheart\.com/podcast/\d+-(?P<pod_title>[\w-]+)-(?P<pod_id>\d+)(?:/episode/(?P<title>[\w-]+)-(?P<id>\d+))?/?(?:\?embed=true)?'
    IE_NAME = 'iheartradio:podcast'
    _TESTS = [{
        'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/',
        'info_dict': {
            'title': 'It Could Happen Here',
            'description': 'md5:5842117412a967eb0b01f8088eb663e2',
            'id': '30717896',
            'display_id': 'it-could-happen-here'
        },
        'playlist_count': 11
    }, {
        'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
        # IHeartRadio inserts ads; it is unclear whether they are customised
        # per request, so the MD5 hash might be inconsistent
        'md5': 'c8609c92c8688dcb69d8541042b8abca',
        'info_dict': {
            'id': '70346499',
            'ext': 'mp3',
            'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus',
            'thumbnail': 'https://i.iheart.com/v3/catalog/podcast/29236323',
            'description': 'md5:96cc7297b3a5a9ebae28643801c96fae',
            'timestamp': 1597741200,
            'upload_date': '20200818',
            'duration': 4156,
            'display_id': 'part-one-alexander-lukashenko-the-dictator'
        }
    }]

    # To get the audio files, we have to use their internal API
    def _real_extract(self, url):
        """Extract a single episode or a playlist of all podcast episodes.

        Returns the info dict of one episode when the URL names an episode,
        otherwise a playlist dict whose entries are the podcast's episodes.
        """
        match = re.match(self._VALID_URL, url)
        podcast_display_id = match.group('pod_title')
        podcast_id = match.group('pod_id')
        episode_display_id = match.group('title')
        episode_id = match.group('id')
        current_id = str_or_none(episode_id, default=podcast_id)
        is_single_episode = episode_id is not None

        # Don't load embed pages
        webpage = self._download_webpage(
            url.replace('?embed=true', ''), current_id)

        temp_user = self._register_temp_user(current_id)

        if is_single_episode:
            episode_ids = [episode_id]
        else:
            episode_ids = [
                episode['id']
                for episode in self._get_all_episodes(podcast_id, temp_user)]

        stream_info = self._request_stream_info(
            current_id, podcast_id, episode_ids, temp_user)

        # Webpage metadata is a nice-to-have: neither lookup may abort the
        # extraction, so the og:title fallback is non-fatal as well.
        podcast_title = self._search_regex(
            r'<h1[^>]*>(?P<title>[^<]*)<\/h1>',
            webpage, 'title', fatal=False) or self._og_search_title(
                webpage, fatal=False)
        podcast_description = self._search_regex(
            r'<div class="[^"]*Body2-Description[^"]*">(?:<\/?span[^>]*>)*(?P<description>[^<]*)',
            webpage, 'description', fatal=False)

        if is_single_episode:
            return self._real_extract_single(
                stream_info['items'][0], episode_display_id, podcast_id,
                podcast_title)

        # A podcast URL carries no episode slug, so entries get no display id
        entries = [
            self._real_extract_single(item, None, podcast_id, podcast_title)
            for item in stream_info['items']]
        return {
            '_type': 'playlist',
            'id': podcast_id,
            'display_id': podcast_display_id,
            'title': podcast_title,
            'description': podcast_description,
            'entries': entries
        }

    def _register_temp_user(self, video_id):
        """Register an anonymous user (as the web app does) and return the API's JSON response."""
        random_device_id = compat_str(uuid.uuid4())
        random_oauth_id = compat_str(uuid.uuid4())
        register_user_values = urlencode_postdata({
            'accessToken': 'anon',
            'accessTokenType': 'anon',
            'deviceId': random_device_id,
            'deviceName': 'web-desktop',
            'host': 'webapp.WW',
            'oauthUuid': random_oauth_id,
            'userName': 'anon' + random_oauth_id
        })
        return self._download_json(
            'https://ww.api.iheart.com/api/v1/account/loginOrCreateOauthUser',
            video_id, 'Registering temporary user', data=register_user_values,
            headers={'Accept': 'application/json, text/plain, */*',
                     'X-hostName': 'webapp.WW'})

    def _request_stream_info(self, video_id, podcast_id, episode_ids, temp_user):
        """Request playback stream information for the given episode ids.

        ``temp_user`` must provide the ``sessionId`` and ``profileId`` keys,
        which are sent as request headers.
        """
        streams_values = json.dumps({
            'contentIds': episode_ids,
            'hostName': 'webapp.WW',
            'playedFrom': 6,  # Not sure about the meaning behind this, Firefox uses 6
            'stationId': compat_str(podcast_id),
            'stationType': 'PODCAST'
        }).encode('utf-8')
        return self._download_json(
            'https://ww.api.iheart.com/api/v2/playback/streams',
            video_id, 'Requesting stream info', data=streams_values,
            headers={'Content-Type': 'application/json;charset=utf-8',
                     'X-Session-Id': temp_user['sessionId'],
                     'X-User-Id': temp_user['profileId']})

    def _real_extract_single(self, item_info, display_id, podcast_id, podcast_title):
        """Build the info dict of one episode from a stream info item.

        ``item_info`` must contain ``content`` (with at least ``id`` and
        ``title``) and ``streamUrl``; other content fields are optional.
        ``display_id`` may be None. ``podcast_id`` is used for the thumbnail
        URLs and ``podcast_title`` is reported as the series name.
        """
        content_info = item_info['content']

        thumbnails = [{
            'url': 'https://i.iheart.com/v3/catalog/podcast/%s' % podcast_id,
            'width': 3000,
            'height': 3000
        }]
        # They have an API that dynamically generates images of a needed size
        # This is a list of "standard" sizes which are used by the web app
        thumbnails.extend({
            'url': 'https://i.iheart.com/v3/catalog/podcast/%(id)s?ops=fit(%(size)d, %(size)d)' % {
                'id': podcast_id, 'size': size},
            'width': size,
            'height': size
        } for size in (75, 240, 480))

        # Release date timestamp is in milliseconds; anything above the
        # threshold is scaled down to seconds. Floor division keeps integer
        # input integral on Python 3 too, where "/" would produce a float.
        release_date = content_info.get('startDate')
        if isinstance(release_date, Number) and release_date > 2000000000:
            release_date //= 1000

        # Remove analytics from stream URL (optional)
        stream_url = item_info['streamUrl']
        for tracking_pattern in (
                r'(?:www\.)?podtrac\.com/pts/redirect\.[\w]*/',
                r'chtbl\.com/track/[\w]*/',
                r'\?source=[\w]*'):
            stream_url = re.sub(tracking_pattern, '', stream_url)

        return {
            'id': compat_str(content_info['id']),
            'display_id': display_id,
            'title': content_info['title'],
            'description': clean_html(content_info.get('description')),
            'url': stream_url,
            'duration': content_info.get('duration'),
            'timestamp': release_date,
            'thumbnails': thumbnails,
            'series': podcast_title
        }

    def _get_all_episodes(self, podcast_id, temp_user):
        """Return the ``data`` list of the podcast's episodes API response.

        ``temp_user`` must provide the ``profileId`` key, which is sent as a
        request header.
        """
        episodes_info = self._download_json(
            'https://ww.api.iheart.com/api/v3/podcast/podcasts/%s/episodes?limit=100000' % (
                podcast_id),
            podcast_id, 'Requesting episodes info',
            headers={'Accept': 'application/json, text/plain, */*',
                     'X-hostName': 'webapp.WW',
                     'X-Ihr-Profile-Id': temp_user['profileId']})
        return episodes_info['data']