[wetv] Add new extractor

2025-01-07 13:47:54 +01:00 · 2020-10-05 02:54:46 +02:00 · 2020-10-05 02:54:46 +02:00 · 9c42c640eb
commit 9c42c640eb
parent d65d89183f
2 changed files with 493 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1409,6 +1409,10 @@ from .weibo import (
    WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
+from .wetv import (
+    WeTvIE,
+    WeTvPlaylistIE,
+)
 from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
 from .wsj import (
--- a/youtube_dl/extractor/wetv.py
+++ b/youtube_dl/extractor/wetv.py
@ -0,0 +1,489 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+import string
+import time
+
+from ctypes import c_int32
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_chr,
+    compat_ord,
+    compat_str,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlencode,
+    compat_urlparse,
+)
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    smuggle_url,
+    std_headers,
+    strip_jsonp,
+    unescapeHTML,
+    unsmuggle_url,
+    url_or_none,
+)
+
+
+class WeTvBaseInfoExtractor(InfoExtractor):
+    @staticmethod
+    def parse_video_info(video_info):
+        thumbnails = []
+        for key, value in video_info.items():
+            if key.startswith('pic_'):
+                try:
+                    width, height = key[4:].split('_')
+                except ValueError:
+                    pass
+                else:
+                    thumbnails.append({
+                        'width': int_or_none(width),
+                        'height': int_or_none(height),
+                        'url': url_or_none(value),
+                    })
+
+        return {
+            'id': video_info['vid'],
+            'title': unescapeHTML(video_info['title']),
+            'description': unescapeHTML(video_info.get('desc')),
+            'duration': parse_duration(video_info.get('duration')),
+            'episode_number': int_or_none(video_info.get('episode')),
+            'thumbnails': thumbnails,
+        }
+
+    def extract_info_from_page(self, webpage, video_id):
+        inputs = self._hidden_inputs(webpage)
+        info = self._parse_json(inputs.get('data_sync', ''), video_id, compat_urllib_parse_unquote)
+
+        return info, info['langInfo']['langId'], info['langInfo']['areaCode']
+
+
+class WeTvIE(WeTvBaseInfoExtractor):
+    # Extractor for a single WeTV.vip video
+    IE_NAME = 'wetv'
+    IE_DESC = 'WeTV.vip'
+    # Accepted forms:
+    #   wetv:<id>                                    (internal scheme, used by the playlist extractor)
+    #   http(s)://[m.]wetv.vip/[<language>/][play/[play/]][<playlist_id>[-<slug>]/][play?vid=]<id>
+    # where <id> is 11 and <playlist_id> is 15 lowercase alphanumeric characters.
+    # The trailing group makes sure the 11 character id is not just the
+    # beginning of a longer alphanumeric token (e.g. a playlist id).
+    _VALID_URL = r'''(?x)
+        (?:
+            wetv:|
+            https?://(?:m\.)?wetv\.vip/
+                (?:(?P<language>[a-z]{2}(?:-[a-z]{2})?)/)?
+                (?:play/){,2}
+                (?:(?P<playlist_id>[a-z\d]{15})(?:-[^/]*)?/)?
+                (?:play\?vid=)?
+        )
+        (?P<id>[a-z\d]{11})
+        (?:$|[^a-z\d])'''
+    _TESTS = [{
+        'url': 'https://wetv.vip/play?vid=o00318x0wds',
+        'md5': '6b07174a61ed9c1dfb8defab3b40ba12',
+        'info_dict': {
+            'id': 'o00318x0wds',
+            'ext': 'mp4',
+            'title': "EP1\uff1aThe King's Avatar",
+        }
+    }, {
+        'url': 'https://wetv.vip/en/play/jenizogwk2t8400/o00318x0wds',
+        'only_matching': True,
+    }, {
+        # user video
+        'url': 'https://wetv.vip/en/play/a3150lwr4jn-Ve-Po-Ad%20Review%20%2F%20HowTo',
+        'md5': 'b053543f584bb4ae10767b5c6bf67807',
+        'info_dict': {
+            'id': 'a3150lwr4jn',
+            'ext': 'mp4',
+            'title': 'Ve-Po-Ad Review / HowTo',
+        }
+    }, {
+        # non-m3u8
+        'url': 'https://wetv.vip/en/play/h31438yuulr',
+        'md5': 'c1b83ac4653d38b2d5e423caeab4148b',
+        'info_dict': {
+            'id': 'h31438yuulr',
+            'ext': 'mp4',
+            'title': 'Gokukoku no Brynhildr - "Ichiban Boshi" \u2014 Full Ending',
+        }
+    }]
+
+    @staticmethod
+    def create_guid():
+        return ''.join([random.choice(string.digits + string.ascii_lowercase) for _ in range(32)])
+
+    def _real_initialize(self):
+        self.ckey = CKey()
+        self.default_quality = 'hd'
+        self.common_params = {
+            'charge': 0,
+            'defaultfmt': 'auto',
+            'otype': 'json',
+            'platform': 4830201,
+            'sdtfrom': 1002,
+            'defnpayver': 0,
+            'appVer': '3.5.57',
+            'sphttps': 1,
+            'spwm': 4,
+            'fhdswitch': 0,
+            'show1080p': 0,
+            'isHLS': 1,
+            'dtype': 3,
+            'sphls': 2,
+            'spgzip': 1,
+            'dlver': 2,
+            'drm': 32,
+            'spau': 1,
+            'spaudio': 15,
+            'spsrt': 1,
+            'spvideo': 16,
+            'defsrc': 2,
+            'encryptVer': '8.1',
+            'fp2p': 1,
+            'spadseg': 1,
+            'guid': WeTvIE.create_guid(),
+            'logintoken': '{"main_login":"","openid":"","appid":"","access_token":"","vuserid":"","vusession":""}',
+        }
+
+    def generate_jsonp_url(self, quality, video_id, url, playlist_id, lang_code, country_code):
+        parsed_url = compat_urlparse.urlparse(url)
+        timestamp = time.time()
+
+        params = self.common_params.copy()
+        params.update({
+            'defn': quality,
+            'vid': video_id,
+            'cid': playlist_id,
+            'lang_code': lang_code,
+            'country_code': country_code,
+            'flowid': '{}_{}'.format(WeTvIE.create_guid(), params['platform']),
+            'host': parsed_url.netloc,
+            'refer': parsed_url.netloc,
+            'ehost': compat_urlparse.urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, None, None, None)),
+            'tm': int(timestamp),
+            '_{}'.format(int(timestamp * 1000)): '',
+            'callback': 'txplayerJsonpCallBack_getinfo_{}'.format(random.randint(10000, 1000000)),
+        })
+        params['cKey'] = self.ckey.make(
+            video_id, compat_str(params['tm']), params['appVer'], params['guid'],
+            compat_str(params['platform']), url, std_headers['User-Agent'])
+
+        return 'https://play.wetv.vip/getvinfo?{}'.format(compat_urllib_parse_urlencode(params))
+
+    def get_jsonp_data(self, quality, video_id, *args):
+        jsonp_url = self.generate_jsonp_url(quality, video_id, *args)
+        # "accept-encoding: gzip" results in
+        # EOFError: Compressed file ended before the end-of-stream marker was reached
+        return self._download_json(jsonp_url, video_id, transform_source=strip_jsonp,
+                                   note='Downloading {} metadata'.format(quality),
+                                   headers={'Accept-Encoding': 'deflate'})
+
+    @staticmethod
+    def extract_qualities(data):
+        qualities = []
+        for quality in data['fl']['fi']:
+            id = quality['name']
+            qualities.append({
+                'format_id': id,
+                'format_note': quality['cname'],
+                'filesize_approx': quality['fs'],
+            })
+
+        return qualities
+
+    @staticmethod
+    def extract_format(data):
+        video_info = data['vl']['vi'][0]
+
+        url = video_info['ul']['ui'][random.randint(0, 2)]['url']
+        if 'fvkey' in video_info:
+            url += '{}?vkey={}'.format(video_info['fn'], video_info['fvkey'])
+        elif url.endswith('/'):
+            url += '{}.m3u8?ver=4'.format(video_info['fn'])
+
+        return {
+            'url': url,
+            'ext': 'mp4',
+            'width': int_or_none(video_info.get('vw')),
+            'height': int_or_none(video_info.get('vh')),
+        }
+
+    def get_formats_and_data(self, *args):
+        formats = []
+
+        default_quality_data = self.get_jsonp_data(self.default_quality, *args)
+        qualities = WeTvIE.extract_qualities(default_quality_data)
+
+        for quality in qualities:
+            id = quality['format_id']
+            if id == self.default_quality:
+                data = default_quality_data
+            else:
+                data = self.get_jsonp_data(id, *args)
+
+            quality.update(WeTvIE.extract_format(data))
+
+            formats.append(quality)
+
+        return formats, default_quality_data
+
+    def _get_subtitles(self, data):
+        subtitles = {}
+        for subtitle_info in data['sfl'].get('fi', []):
+            subtitles[subtitle_info['lang'].lower()] = [{'url': subtitle_info['url']}]
+
+        return subtitles
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url)
+
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        language = mobj.group('language')
+        # urls like this: https://wetv.vip/en/play/jenizogwk2t8400/play?vid=p0031yjo98d
+        # when opened in new tab - load series page (and start playing the first episode)
+        # when clicked on the website - load clicked episode
+        # solution: use https://wetv.vip/en/play/p0031yjo98d instead
+        good_url = 'https://wetv.vip/{}/play/{}'.format(language or 'en', video_id)
+
+        if smuggled_data:
+            # playlist extractor sets the metadata like title, duration, etc.,
+            # no need to load the webpage again
+            result = {}
+            playlist_id = smuggled_data['playlist_id']
+            lang_code = smuggled_data['lang_code']
+            country_code = smuggled_data['country_code']
+        else:
+            webpage = self._download_webpage(good_url, video_id)
+
+            full_url = self._og_search_url(webpage)
+            mobj = re.match(self._VALID_URL, full_url)
+            playlist_id = mobj.group('playlist_id') or ''
+
+            info, lang_code, country_code = self.extract_info_from_page(webpage, video_id)
+            result = WeTvBaseInfoExtractor.parse_video_info(info['videoInfo'])
+
+        formats, data = self.get_formats_and_data(video_id, good_url, playlist_id, lang_code, country_code)
+
+        subtitles = self.extract_subtitles(data)
+
+        result.update({
+            'id': video_id,
+            'url': 'http://example.com/example.mp4',
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+        return result
+
+
+class WeTvPlaylistIE(WeTvBaseInfoExtractor):
+    IE_NAME = 'wetv:playlist'
+    IE_DESC = 'WeTV.vip playlists'
+    _VALID_URL = r'''(?x)
+        https?://(?:m\.)?wetv\.vip/
+            (?:[a-z]{2}(?:-[a-z]{2})?/)?
+            play(?:/|\?cid=)
+            (?P<id>[a-z\d]{15})
+            (?:$|[^a-z\d])'''
+    _TESTS = [{
+        'url': 'https://wetv.vip/en/play/jenizogwk2t8400',
+        'info_dict': {
+            'id': 'jenizogwk2t8400',
+            'title': "The King's Avatar",
+            'description': 'md5:eca1c149133af485673d7676d4eff0c9',
+        },
+        'playlist_count': 40,
+    }, {
+        'url': 'https://wetv.vip/play?cid=jenizogwk2t8400',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        info, lang_code, country_code = self.extract_info_from_page(webpage, playlist_id)
+
+        entries = []
+        for video_info in info['videoList']:
+            parsed_info = WeTvBaseInfoExtractor.parse_video_info(video_info)
+            smuggled_url = smuggle_url(
+                'wetv:{}'.format(parsed_info['id'], ),
+                {
+                    'playlist_id': playlist_id,
+                    'lang_code': lang_code,
+                    'country_code': country_code,
+                }
+            )
+            parsed_info.update({
+                '_type': 'url_transparent',
+                'url': smuggled_url,
+                'ie_key': WeTvIE.ie_key(),
+            })
+            entries.append(parsed_info)
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': info.get('coverInfo', {}).get('title'),
+            'description': info.get('coverInfo', {}).get('description'),
+            'entries': entries,
+        }
+
+
+class CKey:
+    def __init__(self):
+        self.encryption_arrays = [[
+            1332468387, -1641050960, 2136896045, -1629555948,
+            1399201960, -850809832, -1307058635, 751381793,
+            -1933648423, 1106735553, -203378700, -550927659,
+            766369351, 1817882502, -1615200142, 1083409063,
+            -104955314, -1780208184, 173944250, 1254993693,
+            1422337688, -1054667952, -880990486, -2119136777,
+            -1822404972, 1380140484, -1723964626, 412019417,
+            -890799303, -1734066435, 26893779, 420787978,
+            -1337058067, 686432784, 695238595, 811911369,
+            -391724567, -1068702727, -381903814, -648522509,
+            -1266234148, 1959407397, -1644776673, 1152313324]]
+        d = [None] * 256
+        f = d.copy()
+        g = d.copy()
+        h = d.copy()
+        j = d.copy()
+        o = d.copy()
+        for i in range(256):
+            o[i] = i << 1 if i < 128 else i << 1 ^ 283
+
+        t = 0
+        u = 0
+        for i in range(256):
+            v = u ^ u << 1 ^ u << 2 ^ u << 3 ^ u << 4
+            v = CKey.rshift(v, 8) ^ 255 & v ^ 99
+            d[t] = v
+            x = o[t]
+            z = o[o[x]]
+            A = CKey.int32(257 * o[v] ^ 16843008 * v)
+            f[t] = CKey.int32(A << 24 | CKey.rshift(A, 8))
+            g[t] = CKey.int32(A << 16 | CKey.rshift(A, 16))
+            h[t] = CKey.int32(A << 8 | CKey.rshift(A, 24))
+            j[t] = A
+            if t == 0:
+                t = 1
+                u = 1
+            else:
+                t = x ^ o[o[o[z ^ x]]]
+                u ^= o[o[u]]
+
+        self.encryption_arrays.append(f)
+        self.encryption_arrays.append(g)
+        self.encryption_arrays.append(h)
+        self.encryption_arrays.append(j)
+        self.encryption_arrays.append(d)
+
+    @staticmethod
+    def rshift(val, n):
+        return (val & 0xFFFFFFFF) >> n
+
+    @staticmethod
+    def int32(val):
+        return c_int32(val).value
+
+    @staticmethod
+    def encode_text(text):
+        length = len(text)
+        arr = [0] * (length // 4)
+        for i in range(length):
+            arr[i // 4] |= (255 & ord(text[i])) << 24 - i % 4 * 8
+        return arr, length
+
+    @staticmethod
+    def decode_text(arr, length):
+        text_array = []
+        for i in range(length):
+            text_array.append('{:02x}'.format(
+                CKey.rshift(arr[i // 4], 24 - i % 4 * 8) & 255))
+
+        return ''.join(text_array)
+
+    @staticmethod
+    def calculate_hash(text):
+        result = 0
+        for char in text:
+            result = CKey.int32(result << 5) - result + compat_ord(char)
+        return compat_str(result)
+
+    @staticmethod
+    def pad_text(text):
+        pad_length = 16 - len(text) % 16
+        return text + compat_chr(pad_length) * pad_length
+
+    def encrypt(self, arr):
+        for i in range(0, len(arr), 4):
+            self.main_algorithm(arr, i)
+
+    def main_algorithm(self, a, b):
+        """Encrypt in place the four words ``a[b:b + 4]``.
+
+        ``a`` is a list of 32-bit words and ``b`` the index of the first
+        word of the block.  Blocks are chained: each block is XORed with the
+        previous (already encrypted) block before being transformed, the
+        first one with a fixed 4-word vector.
+
+        NOTE(review): the structure looks like AES-128 in CBC mode with a
+        pre-expanded key and a fixed IV - confirm before relying on it.
+        """
+        # c: the 44 constant words, d-g: word lookup tables,
+        # h: byte substitution table (all built in __init__)
+        c, d, e, f, g, h = self.encryption_arrays
+
+        if b == 0:
+            xor_arr = [22039283, 1457920463, 776125350, -1941999367]
+        else:
+            # Output of the previous block (encrypt walks front to back)
+            xor_arr = a[b - 4: b]
+
+        for i, val in enumerate(xor_arr):
+            a[b + i] ^= val
+
+        # Mix in the first four constant words
+        j = a[b] ^ c[0]
+        k = a[b + 1] ^ c[1]
+        l = a[b + 2] ^ c[2]
+        m = a[b + 3] ^ c[3]
+        n = 4
+        # Nine table-driven rounds, each consuming the next four constant words
+        for _ in range(9):
+            q = (d[CKey.rshift(j, 24)] ^ e[CKey.rshift(k, 16) & 255]
+                 ^ f[CKey.rshift(l, 8) & 255] ^ g[255 & m] ^ c[n])
+            s = (d[CKey.rshift(k, 24)] ^ e[CKey.rshift(l, 16) & 255]
+                 ^ f[CKey.rshift(m, 8) & 255] ^ g[255 & j] ^ c[n + 1])
+            t = (d[CKey.rshift(l, 24)] ^ e[CKey.rshift(m, 16) & 255]
+                 ^ f[CKey.rshift(j, 8) & 255] ^ g[255 & k] ^ c[n + 2])
+            # m is overwritten last: its old value is still needed for q, s and t
+            m = (d[CKey.rshift(m, 24)] ^ e[CKey.rshift(j, 16) & 255]
+                 ^ f[CKey.rshift(k, 8) & 255] ^ g[255 & l] ^ c[n + 3])
+            j = q
+            k = s
+            l = t
+            n += 4
+
+        # Final round: byte substitution through h only, then the last four
+        # constant words (n is 40 at this point)
+        q = CKey.int32(h[CKey.rshift(j, 24)] << 24
+                       | h[CKey.rshift(k, 16) & 255] << 16
+                       | h[CKey.rshift(l, 8) & 255] << 8
+                       | h[255 & m]) ^ c[n]
+        s = CKey.int32(h[CKey.rshift(k, 24)] << 24
+                       | h[CKey.rshift(l, 16) & 255] << 16
+                       | h[CKey.rshift(m, 8) & 255] << 8
+                       | h[255 & j]) ^ c[n + 1]
+        t = CKey.int32(h[CKey.rshift(l, 24)] << 24
+                       | h[CKey.rshift(m, 16) & 255] << 16
+                       | h[CKey.rshift(j, 8) & 255] << 8
+                       | h[255 & k]) ^ c[n + 2]
+        m = CKey.int32(h[CKey.rshift(m, 24)] << 24
+                       | h[CKey.rshift(j, 16) & 255] << 16
+                       | h[CKey.rshift(k, 8) & 255] << 8
+                       | h[255 & l]) ^ c[n + 3]
+        # Write the encrypted block back over the input
+        a[b] = q
+        a[b + 1] = s
+        a[b + 2] = t
+        a[b + 3] = m
+
+    def make(self, vid, tm, app_ver, guid, platform, url,
+             # user_agent is shortened anyway
+             user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.',
+             referer='', nav_code_name='Mozilla',
+             nav_name='Netscape', nav_platform='Win32'):
+        text_parts = [
+            '', vid, tm, 'mg3c3b04ba', app_ver, guid, platform,
+            url[:48], user_agent[:48].lower(), referer[:48],
+            nav_code_name, nav_name, nav_platform, '00', ''
+        ]
+        text_parts.insert(1, CKey.calculate_hash('|'.join(text_parts)))
+
+        text = CKey.pad_text('|'.join(text_parts))
+        [arr, length] = CKey.encode_text(text)
+        self.encrypt(arr)
+        return CKey.decode_text(arr, length).upper()