From cef5fe698a0d571f4b6c3bd35367abc5c759196c Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Mon, 5 Oct 2020 12:38:25 +0800 Subject: [PATCH] [acfun] Add new extractor --- youtube_dl/extractor/acfun.py | 220 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 + 2 files changed, 225 insertions(+) create mode 100644 youtube_dl/extractor/acfun.py diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py new file mode 100644 index 000000000..cae8bbe8b --- /dev/null +++ b/youtube_dl/extractor/acfun.py @@ -0,0 +1,220 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import ( + compat_cookiejar, + compat_urllib_parse_urlencode, + compat_urllib_request, +) +from ..utils import ( + int_or_none, + float_or_none, + str_or_none, + str_to_int, + sanitized_Request, + ExtractorError, +) + +class BasicAcfunInfoExtractor(InfoExtractor): + _FAKE_HEADERS = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa + 'Accept-Charset': 'UTF-8,*;q=0.5', + 'Accept-Encoding': 'gzip,deflate,sdch', + 'Accept-Language': 'en-US,en;q=0.8', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', # noqa + } + def _extract_formats(self, currentVideoInfo): + durationMillis = currentVideoInfo.get('durationMillis') + if 'ksPlayJson' in currentVideoInfo: + ksPlayJson = ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) + representation = ksPlayJson.get('adaptationSet')[0].get('representation') + + formats = [] + for stream in representation: + size = float_or_none(durationMillis) * stream["avgBitrate"] / 8 + stream_id = stream["qualityLabel"] + quality = stream["qualityType"] + formats += [{ + 'url': stream["url"], + 'ext': 'mp4', + 'width': stream.get('width'), + 'height': stream.get('height'), + 'filesize': size, + }] + formats = formats[::-1] + self._sort_formats(formats) + return formats + +class AcfunIE(BasicAcfunInfoExtractor): + _VALID_URL = r'https?://www\.acfun\.cn/v/ac(?P[_\d]+)' + _TESTS = [ + { + 'url': 'https://www.acfun.cn/v/ac18184362', + 'info_dict': { + 'id': '18184362', + 'ext': 'mp4', + 'duration': 192.042, + 'title': '【AC娘】魔性新单《极乐857》上线!来和AC娘一起云蹦迪吧!', + 'uploader': 'AC娘本体', + 'uploader_id': 23682490 + } + }, + { + 'url': 'https://www.acfun.cn/v/ac17532274_3', + 'info_dict': { + 'id': '17532274_3', + 'ext': 'mp4', + 'duration': 233.770, + 'title': '【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱! - TRAP 阿婵', + 'uploader': 'AC娘本体', + 'uploader_id': 23682490 + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) + + json_text = self._html_search_regex(r'(?s)videoInfo\s*=\s*(\{.*?\});', webpage, 'json_text') + json_data = json.loads(json_text) + + title = json_data['title'] + p_title = self._html_search_regex(r"(.*?)", webpage, 'p_title', default=None) + if p_title: + title = '%s - %s' % (title, p_title) + + uploader = json_data.get('user').get('name') + uploader_id = json_data.get('user').get('id') + + currentVideoInfo = json_data.get('currentVideoInfo') + durationMillis = currentVideoInfo.get('durationMillis') + duration = durationMillis / 1000 + + formats = self._extract_formats(currentVideoInfo) + return { + 'id': video_id, + 'uploader_id': str_to_int(uploader_id), + 'title': title, + 'uploader': str_or_none(uploader), + 'duration': float_or_none(duration), + 'formats': formats + } + + +class AcfunBangumiIE(BasicAcfunInfoExtractor): + _VALID_URL = r'https?://www\.acfun\.cn/bangumi/aa(?P[_\d]+)' + _TEST = { + 'url': 'https://www.acfun.cn/bangumi/aa6002917_36188_1748679', + 'info_dict': { + 'id': '6002917_36188_1748679', + 'ext': 'mp4', + 'duration': 1437.076, + 'title': '租借女友 第12话 告白和女友', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) + + json_text = self._html_search_regex(r'(?s)bangumiData\s*=\s*(\{.*?\});', webpage, 'json_text') + json_data = json.loads(json_text) + + title = json_data.get('showTitle') or json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] + + currentVideoInfo = json_data.get('currentVideoInfo') + durationMillis = currentVideoInfo.get('durationMillis') + duration = durationMillis / 1000 + + formats = self._extract_formats(currentVideoInfo) + return { + 'id': video_id, + 'title': title, + 'duration': float_or_none(duration), + 'formats': formats + } + +class AcfunLiveIE(BasicAcfunInfoExtractor): + _VALID_URL = r'https?://live\.acfun\.cn/live/(?P\d+)' + _TEST = { + 'url': 'https://live.acfun.cn/live/36782183', + 'only_matching': True, + 'info_dict': { + 'id': '36782183', + 'ext': 'mp4', + # 'title': '看见兔兔就烦!', + 'is_live': True, + } + } + + def _real_extract(self, url): + live_id = self._match_id(url) + self._FAKE_HEADERS.update({ + 'Referer': url + }) + + # Firstly get _did cookie + fisrt_req = sanitized_Request(url, headers=self._FAKE_HEADERS) + first_res = compat_urllib_request.urlopen(fisrt_req) + + for header_name, header_value in first_res.info().items(): + if header_name.lower() == 'set-cookie': + cookies = header_value + if not cookies: + raise ExtractorError('Fail to fetch cookies') + + cookies_dict = dict(c.strip(' ,').split('=', 1) for c in cookies.split(';')) + did_cookie = cookies_dict['_did'] + + self._FAKE_HEADERS.update({ + 'Cookie': '_did=%s' % did_cookie + }) + + # Login to get userId and acfun.api.visitor_st + login_data = compat_urllib_parse_urlencode({'sid': 'acfun.api.visitor'}).encode('ascii') + login_json = self._download_json( + 'https://id.app.acfun.cn/rest/app/visitor/login', + live_id, + data=login_data, + headers=self._FAKE_HEADERS) + + streams_url = "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%d&did=%s&acfun.api.visitor_st=%s" % ( + login_json['userId'], + did_cookie, login_json['acfun.api.visitor_st']) + + # Fetch stream lists + fetch_streams_data = compat_urllib_parse_urlencode({ + 'authorId': int_or_none(live_id), + 'pullStreamType': 'FLV' + }).encode('ascii') + + streams_json = self._download_json( + streams_url, + live_id, + data=fetch_streams_data, + headers=self._FAKE_HEADERS) + + # print(streams_json) + title = streams_json['data']['caption'] + streams_info = json.loads(streams_json['data']['videoPlayRes']) # streams info + representation = streams_info['liveAdaptiveManifest'][0]['adaptationSet']['representation'] + + formats = [] + for stream in representation: + quality = stream["qualityType"] + formats += [{ + 'url': stream["url"], + 'ext': 'mp4', + 'tbr': stream.get('bitrate'), + }] + self._sort_formats(formats) + return { + 'id': live_id, + 'title': self._live_title(title), + 'formats': formats, + 'is_live': True + } \ No newline at end of file diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ae7079a6a..7023ca3d9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -18,6 +18,11 @@ from .acast import ( ACastIE, ACastChannelIE, ) +from .acfun import ( + AcfunIE, + AcfunBangumiIE, + AcfunLiveIE, +) from .adn import ADNIE from .adobeconnect import AdobeConnectIE from .adobetv import (