Infoextractor for gaana.com

2025-01-07 13:47:54 +01:00 · 2018-12-15 12:02:35 +05:30 · 2018-12-15 12:02:35 +05:30 · b22aadb3a8
commit b22aadb3a8
parent 59e2803cb0
1 changed files with 174 additions and 0 deletions
--- a/youtube_dl/extractor/gaana.py
+++ b/youtube_dl/extractor/gaana.py
@ -0,0 +1,174 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from ..aes import aes_cbc_decrypt
+from ..compat import (
+    compat_b64decode,
+)
+
+from .common import InfoExtractor
+from ..utils import (
+    bytes_to_intlist,
+    intlist_to_bytes,
+    int_or_none,
+)
+
+
+class GaanaBaseIE(InfoExtractor):
+    _BASE_URL = 'https://gaana.com'
+
+    def _Decrypt(self, data):
+
+        key = 'Z0AxbiEoZjEjci4wJCkmJQ=='
+        iv = 'YXNkIUAjIUAjQCExMjMxMg=='
+
+        stream_url = intlist_to_bytes(aes_cbc_decrypt(
+            bytes_to_intlist(compat_b64decode(data)),
+            bytes_to_intlist(compat_b64decode(key)),
+            bytes_to_intlist(compat_b64decode(iv)))).decode()
+
+        s = stream_url[:-ord(stream_url[len(stream_url) - 1:])]
+        return s
+
+    def _create_entry(self, data, video_id):
+
+        raw_data = self._parse_json(data, video_id)
+
+        video_data = raw_data.get('path')
+        title = raw_data.get('title')
+        if not title:
+            print("No title found.")
+        thumbnail = raw_data.get('atw', '') or raw_data.get('albumartwork', '')
+        duration = raw_data.get('duration')
+
+        formats = []
+        if isinstance(video_data, dict):
+            for value in video_data.keys():
+                # need to skip auto
+                # this format and quaity is too dificult to handle for audio player.
+                # currently, it has been skipped
+                # in future this format also be going to available
+                if not value == 'auto':
+                    content = video_data.get(value)
+                    for k in content:
+                        format_url = self._Decrypt(k.get('message'))
+                        format_id = value
+
+                        formats.append({
+                            'url': format_url,
+                            'format_id': format_id,
+                            'ext': 'mp4'
+                        })
+
+            artist = raw_data.get('artist')
+
+            # Remove unwanted # character from string
+            if not artist:
+                artist = artist.replace(artist, "###", ', ')
+
+            return {
+                'id': video_id,
+                'title': title,
+                'description': raw_data.get('description'),
+                'duration': int_or_none(duration),
+                'formats': formats,
+                'album': raw_data.get('albumtitle'),
+                'thumbnail': thumbnail,
+                'artist': artist,
+                'release_date': raw_data.get('release_date'),
+                'language': raw_data.get('language')
+            }
+        else:
+            # we are here, beacause gaana.com uses cloudfont.com also
+            # alongwith some other sites for storage purpose.
+            # that will be implemented soon.
+            return None
+
+
+class GaanaIE(GaanaBaseIE):
+    IE_NAME = 'gaana'
+    _VALID_URL = r'https?://(?:www\.)?gaana\.com/song/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://gaana.com/song/jeeye-to-jeeye-kaise',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        raw_data = self._search_regex(
+            r'class="parentnode sourcelist_\d+"> (.*?) </span>',
+            webpage, 'raw data')
+        entry = self._create_entry(raw_data, video_id)
+        if entry:
+            return entry
+
+
+class GaanaAlbumIE(GaanaBaseIE):
+    IE_NAME = 'gaana:album'
+    _VALID_URL = r'https?://(?:www\.)?gaana\.com/album/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://gaana.com/album/saajan-hindi',
+        'playlist_mincount': 1,
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+        # print(webpage)
+        matchobj = re.findall(r'class="parentnode sourcelist_\d+"> (.*?) </span>', webpage)
+        entries = []
+        for g in matchobj:
+            entry = self._create_entry(g, playlist_id)
+            if entry:
+                entries.append(self._create_entry(g, playlist_id))
+
+        return self.playlist_result(entries, playlist_id)
+
+
+class GaanaArtistIE(GaanaBaseIE):
+    IE_NAME = 'gaana:artist'
+    _VALID_URL = r'https?://(?:www\.)?gaana\.com/artist/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://gaana.com/artist/kumar-sanu',
+        'playlist_mincount': 1,
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+        urls = self._read_entry(webpage, playlist_id)
+        entries = []
+
+        for g in urls:
+            video_id = g.replace('https://gaana.com/song/', '')
+            webpage = self._download_webpage(g, video_id)
+
+            raw_data = self._search_regex(
+                r'class="parentnode sourcelist_\d+"> (.*?) </span>',
+                webpage, 'raw data')
+
+            entry = self._create_entry(raw_data, playlist_id)
+
+            if entry:
+                entries.append(entry)
+
+        return self.playlist_result(entries, playlist_id)
+
+    def _read_entry(self, webpage, video_id):
+        entries = []
+        matchobj = re.findall(r'class="parentnode sourcelist_\d+"> (.*?) </span>', webpage)
+
+        for g in matchobj:
+            raw_data = self._parse_json(g, video_id)
+            new_url = raw_data.get('share_url')
+
+            if new_url:
+                new_url = self._BASE_URL + new_url
+                entries.append(new_url)
+
+        return entries