mirror of
https://codeberg.org/polarisfm/youtube-dl
synced 2024-11-22 16:44:32 +01:00
[iheartradio:podcast] Add new extractor
This commit is contained in:
parent
10709fc7c6
commit
186a3bbdbf
@ -456,6 +456,7 @@ from .ign import (
|
||||
OneUPIE,
|
||||
PCMagIE,
|
||||
)
|
||||
from .iheartradio import IHeartRadioPodcastIE
|
||||
from .imdb import (
|
||||
ImdbIE,
|
||||
ImdbListIE
|
||||
|
195
youtube_dl/extractor/iheartradio.py
Normal file
195
youtube_dl/extractor/iheartradio.py
Normal file
@ -0,0 +1,195 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from .common import InfoExtractor
|
||||
|
||||
from ..compat import compat_str
|
||||
|
||||
from numbers import Number
|
||||
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
str_or_none,
|
||||
urlencode_postdata
|
||||
)
|
||||
|
||||
|
||||
class IHeartRadioPodcastIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://(?:www\.)?iheart\.com/podcast/\d+-(?P<pod_title>[\w-]+)-(?P<pod_id>\d+)(?:/episode/(?P<title>[\w-]+)-(?P<id>\d+))?/?(?:embed=true)?'
|
||||
IE_NAME = "iheartradio:podcast"
|
||||
_TESTS = [{
|
||||
'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/',
|
||||
'info_dict': {
|
||||
'title': 'It Could Happen Here',
|
||||
'description': 'md5:5842117412a967eb0b01f8088eb663e2',
|
||||
'id': '30717896',
|
||||
'display_id': 'it-could-happen-here'
|
||||
},
|
||||
'playlist_count': 11
|
||||
}, {
|
||||
'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
|
||||
# IHeartRadio has ads, I can't tell if they're custom or not,
|
||||
# so the MD5 hash might be inconsistent
|
||||
'md5': 'c8609c92c8688dcb69d8541042b8abca',
|
||||
'info_dict': {
|
||||
'id': '70346499',
|
||||
'ext': 'mp3',
|
||||
'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus',
|
||||
'thumbnail': 'https://i.iheart.com/v3/catalog/podcast/29236323',
|
||||
'description': 'md5:96cc7297b3a5a9ebae28643801c96fae',
|
||||
'timestamp': 1597741200,
|
||||
'upload_date': '20200818',
|
||||
'duration': 4156,
|
||||
'display_id': 'part-one-alexander-lukashenko-the-dictator'
|
||||
}
|
||||
}]
|
||||
|
||||
# To get the audio files, we have to use their internal API
|
||||
def _real_extract(self, url):
|
||||
match = re.match(self._VALID_URL, url)
|
||||
|
||||
podcast_display_id = match.group('pod_title')
|
||||
podcast_id = match.group('pod_id')
|
||||
|
||||
episode_display_id = match.group('title')
|
||||
episode_id = match.group('id')
|
||||
|
||||
current_id = str_or_none(episode_id, default=podcast_id)
|
||||
|
||||
is_single_episode = episode_id is not None
|
||||
|
||||
# Don't load embed pages
|
||||
url = url.replace('?embed=true', '')
|
||||
webpage = self._download_webpage(url, current_id)
|
||||
|
||||
# Register anonymous user, same behavior as web app
|
||||
random_device_id = compat_str(uuid.uuid4())
|
||||
random_oauth_id = compat_str(uuid.uuid4())
|
||||
|
||||
register_user_values = urlencode_postdata({
|
||||
'accessToken': 'anon',
|
||||
'accessTokenType': 'anon',
|
||||
'deviceId': random_device_id,
|
||||
'deviceName': "web-desktop",
|
||||
'host': "webapp.WW",
|
||||
'oauthUuid': random_oauth_id,
|
||||
'userName': 'anon' + random_oauth_id
|
||||
})
|
||||
temp_user = self._download_json(
|
||||
'https://ww.api.iheart.com/api/v1/account/loginOrCreateOauthUser',
|
||||
current_id, "Registering temporary user", data=register_user_values,
|
||||
headers={'Accept': 'application/json, text/plain, */*',
|
||||
'X-hostName': 'webapp.WW'})
|
||||
|
||||
session_id = temp_user['sessionId']
|
||||
profile_id = temp_user['profileId']
|
||||
|
||||
if (not is_single_episode):
|
||||
episode_ids = []
|
||||
for episode in self._get_all_episodes(podcast_id, temp_user):
|
||||
episode_ids.append(episode['id'])
|
||||
else:
|
||||
episode_ids = [episode_id]
|
||||
|
||||
streams_values = json.dumps({
|
||||
'contentIds': episode_ids,
|
||||
'hostName': 'webapp.WW',
|
||||
'playedFrom': 6, # Not sure about the meaning behind this, Firefox uses 6
|
||||
'stationId': compat_str(podcast_id),
|
||||
'stationType': 'PODCAST'
|
||||
}).encode('utf-8')
|
||||
stream_info = self._download_json(
|
||||
'https://ww.api.iheart.com/api/v2/playback/streams',
|
||||
current_id, "Requesting stream info", data=streams_values,
|
||||
headers={'Content-Type': 'application/json;charset=utf-8',
|
||||
'X-Session-Id': session_id,
|
||||
'X-User-Id': profile_id})
|
||||
|
||||
# self.to_screen(stream_info)
|
||||
|
||||
# Extract info from webpage (entirely optional)
|
||||
|
||||
podcast_title = self._search_regex(
|
||||
r'<h1[^>]*>(?P<title>[^<]*)<\/h1>',
|
||||
webpage, 'title', fatal=False) or self._og_search_title(webpage)
|
||||
|
||||
podcast_description = self._search_regex(
|
||||
r'<div class="[^"]*Body2-Description[^"]*">(?:<\/?span[^>]*>)*(?P<description>[^<]*)',
|
||||
webpage, 'description', fatal=False)
|
||||
|
||||
if (is_single_episode):
|
||||
return self._real_extract_single(stream_info['items'][0],
|
||||
episode_display_id, podcast_id,
|
||||
podcast_title)
|
||||
else:
|
||||
entries = []
|
||||
for item in stream_info['items']:
|
||||
entries.append(self._real_extract_single(item,
|
||||
episode_display_id,
|
||||
podcast_id,
|
||||
podcast_title))
|
||||
return {
|
||||
'title': podcast_title,
|
||||
'description': podcast_description,
|
||||
'id': podcast_id,
|
||||
'display_id': podcast_display_id,
|
||||
'_type': 'playlist',
|
||||
'entries': entries
|
||||
}
|
||||
|
||||
def _real_extract_single(self, item_info, display_id, podcast_id, podcast_title):
|
||||
content_info = item_info['content']
|
||||
|
||||
thumbnails = [{
|
||||
'url': 'https://i.iheart.com/v3/catalog/podcast/%s' % podcast_id,
|
||||
'width': 3000,
|
||||
'height': 3000
|
||||
}]
|
||||
|
||||
# They have an API that dynamically generates images of a needed size
|
||||
# This is a list of "standard" sizes which are used by the web app
|
||||
for size in [75, 240, 480]:
|
||||
thumbnails.append({
|
||||
'url': 'https://i.iheart.com/v3/catalog/podcast/%(id)s?ops=fit(%(size)d, %(size)d)' % {
|
||||
'id': podcast_id, 'size': size},
|
||||
'width': size,
|
||||
'height': size
|
||||
})
|
||||
|
||||
# Release date timestamp is in milliseconds
|
||||
release_date = content_info.get('startDate')
|
||||
if (isinstance(release_date, Number) and release_date > 2000000000):
|
||||
release_date /= 1000
|
||||
|
||||
# Remove analytics from stream URL (optional)
|
||||
streamUrl = item_info['streamUrl']
|
||||
streamUrl = re.sub(r'(?:www\.)?podtrac\.com/pts/redirect\.[\w]*/', '', streamUrl)
|
||||
streamUrl = re.sub(r'chtbl\.com/track/[\w]*/', '', streamUrl)
|
||||
streamUrl = re.sub(r'\?source=[\w]*', '', streamUrl)
|
||||
|
||||
return {
|
||||
'id': compat_str(content_info['id']),
|
||||
'display_id': display_id,
|
||||
'title': content_info['title'],
|
||||
'description': clean_html(content_info.get('description')),
|
||||
'url': streamUrl,
|
||||
'duration': content_info.get('duration'),
|
||||
'timestamp': release_date,
|
||||
'thumbnails': thumbnails,
|
||||
'series': podcast_title
|
||||
}
|
||||
|
||||
def _get_all_episodes(self, podcast_id, temp_user):
|
||||
episodes_info = self._download_json(
|
||||
'https://ww.api.iheart.com/api/v3/podcast/podcasts/%s/episodes?limit=100000' % (
|
||||
podcast_id),
|
||||
podcast_id, "Requesting episodes info",
|
||||
headers={'Accept': 'application/json, text/plain, */*',
|
||||
'X-hostName': 'webapp.WW',
|
||||
'X-Ihr-Profile-Id': temp_user['profileId']})
|
||||
|
||||
return episodes_info['data']
|
Loading…
Reference in New Issue
Block a user