From 6635b191fc0f63dbad9ba148370f317390377c4c Mon Sep 17 00:00:00 2001 From: Awal Garg Date: Wed, 9 Jan 2019 02:46:06 +0530 Subject: [PATCH] [JioSaavn] Add extractor --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/jiosaavn.py | 118 +++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 youtube_dl/extractor/jiosaavn.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ae7079a6a..1fe5c3978 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -495,6 +495,10 @@ from .jamendo import ( JamendoAlbumIE, ) from .jeuxvideo import JeuxVideoIE +from .jiosaavn import ( + JioSaavnSongIE, + JioSaavnAlbumIE, +) from .jove import JoveIE from .joj import JojIE from .jwplatform import JWPlatformIE diff --git a/youtube_dl/extractor/jiosaavn.py b/youtube_dl/extractor/jiosaavn.py new file mode 100644 index 000000000..f40e513ad --- /dev/null +++ b/youtube_dl/extractor/jiosaavn.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re +import string +import time + +from .common import InfoExtractor +from ..utils import ( + unescapeHTML, + urlencode_postdata, +) + + +class JioSaavnBaseInfoExtractor(InfoExtractor): + def extract_initial_data_as_json(self, url, id): + webpage = self._download_webpage(url, id) + init_data_re = r'window.__INITIAL_DATA__\s*=\s*(?P.+?);*\s*\' + + def sanitize_init_data(init_data): + new_date_call_re = r'new Date\(.+?\)' + return re.sub(new_date_call_re, '"%s"' % time.ctime(), unescapeHTML(init_data)) + + init_data = self._parse_json(self._search_regex(init_data_re, webpage, + 'init-json'), + id, transform_source=sanitize_init_data) + return init_data + + +class JioSaavnSongIE(JioSaavnBaseInfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)? + (?: + jiosaavn\.com/song/[^/]+/| + saavn.com/s/song/(?:[^/]+/){3} + ) + (?P[\w\d]+) + ''' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', + 'md5': '7b1f70de088ede3a152ea34aece4df42', + 'info_dict': { + 'id': 'OQsEfQFVUXk', + 'ext': 'mp3', + 'title': 'Leja Re', + 'album': 'Leja Re', + }, + }, { + 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + fp = ''.join([random.choice(string.hexdigits) for _ in range(32)]) + + # This is required, else server won't return the media_url (extracted below) + self._download_webpage('https://www.jiosaavn.com/stats.php', audio_id, + query={ + 'ev': 'site:browser:fp', + 'fp': fp, + '_t': int(time.time()), + 'ct': '00000000', + }) + + song_data = self.extract_initial_data_as_json(url, audio_id)['song']['song'] + + data = urlencode_postdata({'__call': 'song.generateAuthToken', + '_format': 'json', + 'bitrate': '128', + 'url': song_data['encrypted_media_url'], + }) + + def clean_api_json(resp): + return self._search_regex(r'(?P\{.+?})', resp, 'api-json') + + media_url = self._download_json('https://www.jiosaavn.com/api.php', + audio_id, data=data, + transform_source=clean_api_json, + )['auth_url'] + + return { + 'id': audio_id, + 'title': song_data['title']['text'], + 'formats': [{ + 'url': media_url, + 'ext': 'mp3', + 'format_note': 'MPEG audio', + 'format_id': '128', + 'vcodec': 'none', + }], + 'album': song_data.get('album', {}).get('text'), + 'thumbnail': song_data.get('image', [None])[0], + } + + +class JioSaavnAlbumIE(JioSaavnBaseInfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)? + (?:(?:jio)?saavn\.com/album/[^/]+/) + (?P[\w\d]+) + ''' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_', + 'info_dict': { + 'id': 'buIOjYZDrNA_', + 'title': '96', + }, + 'playlist_count': 10, + }] + + def _real_extract(self, url): + album_id = self._match_id(url) + album_data = self.extract_initial_data_as_json(url, album_id)['albumView']['album'] + songs = [self.url_result(song['perma_url']) for song in album_data['songs']] + + return self.playlist_result(songs, album_id, album_data.get('title', {}).get('text'))