From cef5fe698a0d571f4b6c3bd35367abc5c759196c Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Mon, 5 Oct 2020 12:38:25 +0800 Subject: [PATCH 1/7] [acfun] Add new extractor --- youtube_dl/extractor/acfun.py | 220 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 + 2 files changed, 225 insertions(+) create mode 100644 youtube_dl/extractor/acfun.py diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py new file mode 100644 index 000000000..cae8bbe8b --- /dev/null +++ b/youtube_dl/extractor/acfun.py @@ -0,0 +1,220 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import ( + compat_cookiejar, + compat_urllib_parse_urlencode, + compat_urllib_request, +) +from ..utils import ( + int_or_none, + float_or_none, + str_or_none, + str_to_int, + sanitized_Request, + ExtractorError, +) + +class BasicAcfunInfoExtractor(InfoExtractor): + _FAKE_HEADERS = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa + 'Accept-Charset': 'UTF-8,*;q=0.5', + 'Accept-Encoding': 'gzip,deflate,sdch', + 'Accept-Language': 'en-US,en;q=0.8', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', # noqa + } + def _extract_formats(self, currentVideoInfo): + durationMillis = currentVideoInfo.get('durationMillis') + if 'ksPlayJson' in currentVideoInfo: + ksPlayJson = ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) + representation = ksPlayJson.get('adaptationSet')[0].get('representation') + + formats = [] + for stream in representation: + size = float_or_none(durationMillis) * stream["avgBitrate"] / 8 + stream_id = stream["qualityLabel"] + quality = stream["qualityType"] + formats += [{ + 'url': stream["url"], + 'ext': 'mp4', + 'width': stream.get('width'), + 'height': stream.get('height'), + 'filesize': size, + }] + formats = formats[::-1] + self._sort_formats(formats) + return formats + +class 
AcfunIE(BasicAcfunInfoExtractor): + _VALID_URL = r'https?://www\.acfun\.cn/v/ac(?P[_\d]+)' + _TESTS = [ + { + 'url': 'https://www.acfun.cn/v/ac18184362', + 'info_dict': { + 'id': '18184362', + 'ext': 'mp4', + 'duration': 192.042, + 'title': '【AC娘】魔性新单《极乐857》上线!来和AC娘一起云蹦迪吧!', + 'uploader': 'AC娘本体', + 'uploader_id': 23682490 + } + }, + { + 'url': 'https://www.acfun.cn/v/ac17532274_3', + 'info_dict': { + 'id': '17532274_3', + 'ext': 'mp4', + 'duration': 233.770, + 'title': '【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱! - TRAP 阿婵', + 'uploader': 'AC娘本体', + 'uploader_id': 23682490 + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) + + json_text = self._html_search_regex(r'(?s)videoInfo\s*=\s*(\{.*?\});', webpage, 'json_text') + json_data = json.loads(json_text) + + title = json_data['title'] + p_title = self._html_search_regex(r"(.*?)", webpage, 'p_title', default=None) + if p_title: + title = '%s - %s' % (title, p_title) + + uploader = json_data.get('user').get('name') + uploader_id = json_data.get('user').get('id') + + currentVideoInfo = json_data.get('currentVideoInfo') + durationMillis = currentVideoInfo.get('durationMillis') + duration = durationMillis / 1000 + + formats = self._extract_formats(currentVideoInfo) + return { + 'id': video_id, + 'uploader_id': str_to_int(uploader_id), + 'title': title, + 'uploader': str_or_none(uploader), + 'duration': float_or_none(duration), + 'formats': formats + } + + +class AcfunBangumiIE(BasicAcfunInfoExtractor): + _VALID_URL = r'https?://www\.acfun\.cn/bangumi/aa(?P[_\d]+)' + _TEST = { + 'url': 'https://www.acfun.cn/bangumi/aa6002917_36188_1748679', + 'info_dict': { + 'id': '6002917_36188_1748679', + 'ext': 'mp4', + 'duration': 1437.076, + 'title': '租借女友 第12话 告白和女友', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) + + json_text 
= self._html_search_regex(r'(?s)bangumiData\s*=\s*(\{.*?\});', webpage, 'json_text') + json_data = json.loads(json_text) + + title = json_data.get('showTitle') or json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] + + currentVideoInfo = json_data.get('currentVideoInfo') + durationMillis = currentVideoInfo.get('durationMillis') + duration = durationMillis / 1000 + + formats = self._extract_formats(currentVideoInfo) + return { + 'id': video_id, + 'title': title, + 'duration': float_or_none(duration), + 'formats': formats + } + +class AcfunLiveIE(BasicAcfunInfoExtractor): + _VALID_URL = r'https?://live\.acfun\.cn/live/(?P\d+)' + _TEST = { + 'url': 'https://live.acfun.cn/live/36782183', + 'only_matching': True, + 'info_dict': { + 'id': '36782183', + 'ext': 'mp4', + # 'title': '看见兔兔就烦!', + 'is_live': True, + } + } + + def _real_extract(self, url): + live_id = self._match_id(url) + self._FAKE_HEADERS.update({ + 'Referer': url + }) + + # Firstly get _did cookie + fisrt_req = sanitized_Request(url, headers=self._FAKE_HEADERS) + first_res = compat_urllib_request.urlopen(fisrt_req) + + for header_name, header_value in first_res.info().items(): + if header_name.lower() == 'set-cookie': + cookies = header_value + if not cookies: + raise ExtractorError('Fail to fetch cookies') + + cookies_dict = dict(c.strip(' ,').split('=', 1) for c in cookies.split(';')) + did_cookie = cookies_dict['_did'] + + self._FAKE_HEADERS.update({ + 'Cookie': '_did=%s' % did_cookie + }) + + # Login to get userId and acfun.api.visitor_st + login_data = compat_urllib_parse_urlencode({'sid': 'acfun.api.visitor'}).encode('ascii') + login_json = self._download_json( + 'https://id.app.acfun.cn/rest/app/visitor/login', + live_id, + data=login_data, + headers=self._FAKE_HEADERS) + + streams_url = "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%d&did=%s&acfun.api.visitor_st=%s" % ( + login_json['userId'], + did_cookie, 
login_json['acfun.api.visitor_st']) + + # Fetch stream lists + fetch_streams_data = compat_urllib_parse_urlencode({ + 'authorId': int_or_none(live_id), + 'pullStreamType': 'FLV' + }).encode('ascii') + + streams_json = self._download_json( + streams_url, + live_id, + data=fetch_streams_data, + headers=self._FAKE_HEADERS) + + # print(streams_json) + title = streams_json['data']['caption'] + streams_info = json.loads(streams_json['data']['videoPlayRes']) # streams info + representation = streams_info['liveAdaptiveManifest'][0]['adaptationSet']['representation'] + + formats = [] + for stream in representation: + quality = stream["qualityType"] + formats += [{ + 'url': stream["url"], + 'ext': 'mp4', + 'tbr': stream.get('bitrate'), + }] + self._sort_formats(formats) + return { + 'id': live_id, + 'title': self._live_title(title), + 'formats': formats, + 'is_live': True + } \ No newline at end of file diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ae7079a6a..7023ca3d9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -18,6 +18,11 @@ from .acast import ( ACastIE, ACastChannelIE, ) +from .acfun import ( + AcfunIE, + AcfunBangumiIE, + AcfunLiveIE, +) from .adn import ADNIE from .adobeconnect import AdobeConnectIE from .adobetv import ( From 6bbb9d0ae886a36739e0cf146e5bb1734e832a12 Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Mon, 5 Oct 2020 18:14:42 +0800 Subject: [PATCH 2/7] [acfun] Re-format code to pass flake8 --- youtube_dl/extractor/acfun.py | 269 ++++++++++++++++++---------------- 1 file changed, 144 insertions(+), 125 deletions(-) diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py index cae8bbe8b..bf4945ecc 100644 --- a/youtube_dl/extractor/acfun.py +++ b/youtube_dl/extractor/acfun.py @@ -5,7 +5,6 @@ import json from .common import InfoExtractor from ..compat import ( - compat_cookiejar, compat_urllib_parse_urlencode, compat_urllib_request, ) @@ -18,203 
+17,223 @@ from ..utils import ( ExtractorError, ) + class BasicAcfunInfoExtractor(InfoExtractor): _FAKE_HEADERS = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa - 'Accept-Charset': 'UTF-8,*;q=0.5', - 'Accept-Encoding': 'gzip,deflate,sdch', - 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', # noqa - } + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", # noqa + "Accept-Charset": "UTF-8,*;q=0.5", + "Accept-Encoding": "gzip,deflate,sdch", + "Accept-Language": "en-US,en;q=0.8", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0", # noqa + } + def _extract_formats(self, currentVideoInfo): - durationMillis = currentVideoInfo.get('durationMillis') - if 'ksPlayJson' in currentVideoInfo: - ksPlayJson = ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) - representation = ksPlayJson.get('adaptationSet')[0].get('representation') + durationMillis = currentVideoInfo.get("durationMillis") + if "ksPlayJson" in currentVideoInfo: + ksPlayJson = ksPlayJson = json.loads(currentVideoInfo["ksPlayJson"]) + representation = ksPlayJson.get("adaptationSet")[0].get("representation") formats = [] for stream in representation: size = float_or_none(durationMillis) * stream["avgBitrate"] / 8 - stream_id = stream["qualityLabel"] - quality = stream["qualityType"] - formats += [{ - 'url': stream["url"], - 'ext': 'mp4', - 'width': stream.get('width'), - 'height': stream.get('height'), - 'filesize': size, - }] - formats = formats[::-1] + formats += [ + { + "url": stream["url"], + "ext": "mp4", + "width": stream.get("width"), + "height": stream.get("height"), + "filesize": size, + } + ] + formats = formats[::-1] self._sort_formats(formats) - return formats + return formats + class AcfunIE(BasicAcfunInfoExtractor): - _VALID_URL = r'https?://www\.acfun\.cn/v/ac(?P[_\d]+)' + _VALID_URL = 
r"https?://www\.acfun\.cn/v/ac(?P[_\d]+)" _TESTS = [ { - 'url': 'https://www.acfun.cn/v/ac18184362', - 'info_dict': { - 'id': '18184362', - 'ext': 'mp4', - 'duration': 192.042, - 'title': '【AC娘】魔性新单《极乐857》上线!来和AC娘一起云蹦迪吧!', - 'uploader': 'AC娘本体', - 'uploader_id': 23682490 - } + "url": "https://www.acfun.cn/v/ac18184362", + "info_dict": { + "id": "18184362", + "ext": "mp4", + "duration": 192.042, + "title": "【AC娘】魔性新单《极乐857》上线!来和AC娘一起云蹦迪吧!", + "uploader": "AC娘本体", + "uploader_id": 23682490, + }, }, { - 'url': 'https://www.acfun.cn/v/ac17532274_3', - 'info_dict': { - 'id': '17532274_3', - 'ext': 'mp4', - 'duration': 233.770, - 'title': '【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱! - TRAP 阿婵', - 'uploader': 'AC娘本体', - 'uploader_id': 23682490 - } - } - ] + "url": "https://www.acfun.cn/v/ac17532274_3", + "info_dict": { + "id": "17532274_3", + "ext": "mp4", + "duration": 233.770, + "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱! - TRAP 阿婵", + "uploader": "AC娘本体", + "uploader_id": 23682490, + }, + }, + ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) - json_text = self._html_search_regex(r'(?s)videoInfo\s*=\s*(\{.*?\});', webpage, 'json_text') + json_text = self._html_search_regex( + r"(?s)videoInfo\s*=\s*(\{.*?\});", webpage, "json_text" + ) json_data = json.loads(json_text) - title = json_data['title'] - p_title = self._html_search_regex(r"(.*?)", webpage, 'p_title', default=None) + title = json_data["title"] + p_title = self._html_search_regex( + r"(.*?)", + webpage, + "p_title", + default=None, + ) if p_title: - title = '%s - %s' % (title, p_title) + title = "%s - %s" % (title, p_title) - uploader = json_data.get('user').get('name') - uploader_id = json_data.get('user').get('id') + uploader = json_data.get("user").get("name") + uploader_id = json_data.get("user").get("id") + + currentVideoInfo = json_data.get("currentVideoInfo") + durationMillis = currentVideoInfo.get("durationMillis") 
+ duration = durationMillis / 1000 - currentVideoInfo = json_data.get('currentVideoInfo') - durationMillis = currentVideoInfo.get('durationMillis') - duration = durationMillis / 1000 - formats = self._extract_formats(currentVideoInfo) return { - 'id': video_id, - 'uploader_id': str_to_int(uploader_id), - 'title': title, - 'uploader': str_or_none(uploader), - 'duration': float_or_none(duration), - 'formats': formats + "id": video_id, + "uploader_id": str_to_int(uploader_id), + "title": title, + "uploader": str_or_none(uploader), + "duration": float_or_none(duration), + "formats": formats, } class AcfunBangumiIE(BasicAcfunInfoExtractor): - _VALID_URL = r'https?://www\.acfun\.cn/bangumi/aa(?P[_\d]+)' + _VALID_URL = r"https?://www\.acfun\.cn/bangumi/aa(?P[_\d]+)" _TEST = { - 'url': 'https://www.acfun.cn/bangumi/aa6002917_36188_1748679', - 'info_dict': { - 'id': '6002917_36188_1748679', - 'ext': 'mp4', - 'duration': 1437.076, - 'title': '租借女友 第12话 告白和女友', - } + "url": "https://www.acfun.cn/bangumi/aa6002917_36188_1748679", + "info_dict": { + "id": "6002917_36188_1748679", + "ext": "mp4", + "duration": 1437.076, + "title": "租借女友 第12话 告白和女友", + }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) - json_text = self._html_search_regex(r'(?s)bangumiData\s*=\s*(\{.*?\});', webpage, 'json_text') + json_text = self._html_search_regex( + r"(?s)bangumiData\s*=\s*(\{.*?\});", webpage, "json_text" + ) json_data = json.loads(json_text) - title = json_data.get('showTitle') or json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] + title = ( + json_data.get("showTitle") + or json_data["bangumiTitle"] + + " " + + json_data["episodeName"] + + " " + + json_data["title"] + ) - currentVideoInfo = json_data.get('currentVideoInfo') - durationMillis = currentVideoInfo.get('durationMillis') - duration = durationMillis / 1000 + currentVideoInfo = 
json_data.get("currentVideoInfo") + durationMillis = currentVideoInfo.get("durationMillis") + duration = durationMillis / 1000 formats = self._extract_formats(currentVideoInfo) return { - 'id': video_id, - 'title': title, - 'duration': float_or_none(duration), - 'formats': formats + "id": video_id, + "title": title, + "duration": float_or_none(duration), + "formats": formats, } + class AcfunLiveIE(BasicAcfunInfoExtractor): - _VALID_URL = r'https?://live\.acfun\.cn/live/(?P\d+)' + _VALID_URL = r"https?://live\.acfun\.cn/live/(?P\d+)" _TEST = { - 'url': 'https://live.acfun.cn/live/36782183', - 'only_matching': True, - 'info_dict': { - 'id': '36782183', - 'ext': 'mp4', + "url": "https://live.acfun.cn/live/36782183", + "only_matching": True, + "info_dict": { + "id": "36782183", + "ext": "mp4", # 'title': '看见兔兔就烦!', - 'is_live': True, - } + "is_live": True, + }, } def _real_extract(self, url): live_id = self._match_id(url) - self._FAKE_HEADERS.update({ - 'Referer': url - }) + self._FAKE_HEADERS.update({"Referer": url}) # Firstly get _did cookie fisrt_req = sanitized_Request(url, headers=self._FAKE_HEADERS) first_res = compat_urllib_request.urlopen(fisrt_req) for header_name, header_value in first_res.info().items(): - if header_name.lower() == 'set-cookie': + if header_name.lower() == "set-cookie": cookies = header_value if not cookies: - raise ExtractorError('Fail to fetch cookies') + raise ExtractorError("Fail to fetch cookies") - cookies_dict = dict(c.strip(' ,').split('=', 1) for c in cookies.split(';')) - did_cookie = cookies_dict['_did'] + cookies_dict = dict(c.strip(" ,").split("=", 1) for c in cookies.split(";")) + did_cookie = cookies_dict["_did"] - self._FAKE_HEADERS.update({ - 'Cookie': '_did=%s' % did_cookie - }) + self._FAKE_HEADERS.update({"Cookie": "_did=%s" % did_cookie}) # Login to get userId and acfun.api.visitor_st - login_data = compat_urllib_parse_urlencode({'sid': 'acfun.api.visitor'}).encode('ascii') + login_data = 
compat_urllib_parse_urlencode({"sid": "acfun.api.visitor"}).encode( + "ascii" + ) login_json = self._download_json( - 'https://id.app.acfun.cn/rest/app/visitor/login', - live_id, - data=login_data, - headers=self._FAKE_HEADERS) + "https://id.app.acfun.cn/rest/app/visitor/login", + live_id, + data=login_data, + headers=self._FAKE_HEADERS, + ) - streams_url = "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%d&did=%s&acfun.api.visitor_st=%s" % ( - login_json['userId'], - did_cookie, login_json['acfun.api.visitor_st']) + streams_url = ( + "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%d&did=%s&acfun.api.visitor_st=%s" + % (login_json["userId"], did_cookie, login_json["acfun.api.visitor_st"]) + ) # Fetch stream lists - fetch_streams_data = compat_urllib_parse_urlencode({ - 'authorId': int_or_none(live_id), - 'pullStreamType': 'FLV' - }).encode('ascii') + fetch_streams_data = compat_urllib_parse_urlencode( + {"authorId": int_or_none(live_id), "pullStreamType": "FLV"} + ).encode("ascii") streams_json = self._download_json( - streams_url, - live_id, - data=fetch_streams_data, - headers=self._FAKE_HEADERS) + streams_url, live_id, data=fetch_streams_data, headers=self._FAKE_HEADERS + ) # print(streams_json) - title = streams_json['data']['caption'] - streams_info = json.loads(streams_json['data']['videoPlayRes']) # streams info - representation = streams_info['liveAdaptiveManifest'][0]['adaptationSet']['representation'] - + title = streams_json["data"]["caption"] + streams_info = json.loads(streams_json["data"]["videoPlayRes"]) # streams info + representation = streams_info["liveAdaptiveManifest"][0]["adaptationSet"][ + "representation" + ] + formats = [] for stream in representation: - quality = stream["qualityType"] - formats += [{ - 'url': stream["url"], - 'ext': 'mp4', - 'tbr': stream.get('bitrate'), - }] + formats += [ + { + "url": stream["url"], + "ext": "mp4", 
+ "tbr": stream.get("bitrate"), + } + ] self._sort_formats(formats) return { - 'id': live_id, - 'title': self._live_title(title), - 'formats': formats, - 'is_live': True - } \ No newline at end of file + "id": live_id, + "title": self._live_title(title), + "formats": formats, + "is_live": True, + } From 1758030b08c6f671c26a79778022cda511b144dd Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Mon, 5 Oct 2020 18:42:49 +0800 Subject: [PATCH 3/7] [acfun] Add error prompt --- youtube_dl/extractor/acfun.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py index bf4945ecc..fab83a4e3 100644 --- a/youtube_dl/extractor/acfun.py +++ b/youtube_dl/extractor/acfun.py @@ -101,7 +101,7 @@ class AcfunIE(BasicAcfunInfoExtractor): currentVideoInfo = json_data.get("currentVideoInfo") durationMillis = currentVideoInfo.get("durationMillis") - duration = durationMillis / 1000 + duration = float_or_none(durationMillis) / 1000.0 formats = self._extract_formats(currentVideoInfo) return { @@ -146,7 +146,7 @@ class AcfunBangumiIE(BasicAcfunInfoExtractor): currentVideoInfo = json_data.get("currentVideoInfo") durationMillis = currentVideoInfo.get("durationMillis") - duration = durationMillis / 1000 + duration = float_or_none(durationMillis) / 1000.0 formats = self._extract_formats(currentVideoInfo) return { @@ -214,7 +214,11 @@ class AcfunLiveIE(BasicAcfunInfoExtractor): streams_url, live_id, data=fetch_streams_data, headers=self._FAKE_HEADERS ) - # print(streams_json) + try: + assert "data" in streams_json + except AssertionError: + raise ExtractorError("This live room is currently closed") + title = streams_json["data"]["caption"] streams_info = json.loads(streams_json["data"]["videoPlayRes"]) # streams info representation = streams_info["liveAdaptiveManifest"][0]["adaptationSet"][ From 345bd3b0264ad82f5a91c3d027bb49b6c1d6f7ee Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Wed, 7 Oct 2020 02:28:01 
+0800 Subject: [PATCH 4/7] [acfun] Add playlist support for normal video --- youtube_dl/extractor/acfun.py | 65 ++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py index fab83a4e3..004898990 100644 --- a/youtube_dl/extractor/acfun.py +++ b/youtube_dl/extractor/acfun.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..compat import ( @@ -51,9 +52,10 @@ class BasicAcfunInfoExtractor(InfoExtractor): class AcfunIE(BasicAcfunInfoExtractor): - _VALID_URL = r"https?://www\.acfun\.cn/v/ac(?P[_\d]+)" + _VALID_URL = r"https?://www\.acfun\.cn/v/ac(?P\d+)(?P[_\d]+)?" _TESTS = [ { + "note": "single video without playlist", "url": "https://www.acfun.cn/v/ac18184362", "info_dict": { "id": "18184362", @@ -65,9 +67,10 @@ class AcfunIE(BasicAcfunInfoExtractor): }, }, { + "note": "single video in playlist", "url": "https://www.acfun.cn/v/ac17532274_3", "info_dict": { - "id": "17532274_3", + "id": "17532274", "ext": "mp4", "duration": 233.770, "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱! 
- TRAP 阿婵", @@ -75,30 +78,68 @@ class AcfunIE(BasicAcfunInfoExtractor): "uploader_id": 23682490, }, }, + { + "note": "multiple video with playlist", + "url": "https://www.acfun.cn/v/ac17532274", + "info_dict": { + "id": "17532274", + "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱!", + "uploader": "AC娘本体", + "uploader_id": 23682490, + }, + "playlist_count": 5 + } ] def _real_extract(self, url): - video_id = self._match_id(url) + video_id, page_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) json_text = self._html_search_regex( r"(?s)videoInfo\s*=\s*(\{.*?\});", webpage, "json_text" ) - json_data = json.loads(json_text) + json_data = json.loads(json_text) title = json_data["title"] + + uploader = str_or_none(json_data.get("user").get("name")) + uploader_id = str_to_int(json_data.get("user").get("id")) + + videoList = json_data.get('videoList') + if videoList: + video_num = len(videoList) + + if not page_id and video_num and video_num > 1: + if not self._downloader.params.get('noplaylist'): + self.to_screen('Downloading all pages %s - add --no-playlist to just download video' % video_id) + entries = [self.url_result( + '%s_%d' % (url, pid), + self.IE_NAME, + video_id='%s_%d' % (video_id, pid)) + for pid in range(1, video_num+1)] + playlist = self.playlist_result(entries, video_id, title) + playlist.update({ + 'uploader': uploader, + 'uploader_id': uploader_id, + }) + return playlist + + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + p_title = self._html_search_regex( r"(.*?)", webpage, "p_title", default=None, - ) + ) + if p_title: - title = "%s - %s" % (title, p_title) - - uploader = json_data.get("user").get("name") - uploader_id = json_data.get("user").get("id") + title = "%s-%s" % (title, p_title) + if page_id: + video_id += page_id + currentVideoInfo = json_data.get("currentVideoInfo") durationMillis = currentVideoInfo.get("durationMillis") duration = 
float_or_none(durationMillis) / 1000.0 @@ -106,10 +147,10 @@ class AcfunIE(BasicAcfunInfoExtractor): formats = self._extract_formats(currentVideoInfo) return { "id": video_id, - "uploader_id": str_to_int(uploader_id), + "uploader_id": uploader_id, "title": title, - "uploader": str_or_none(uploader), - "duration": float_or_none(duration), + "uploader": uploader, + "duration": duration, "formats": formats, } From d96564c350e56eecc35feefa371ba781dd0793db Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Wed, 7 Oct 2020 02:39:21 +0800 Subject: [PATCH 5/7] [acfun] Fix test cases --- youtube_dl/extractor/acfun.py | 60 ++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py index 004898990..5e97f7603 100644 --- a/youtube_dl/extractor/acfun.py +++ b/youtube_dl/extractor/acfun.py @@ -70,10 +70,10 @@ class AcfunIE(BasicAcfunInfoExtractor): "note": "single video in playlist", "url": "https://www.acfun.cn/v/ac17532274_3", "info_dict": { - "id": "17532274", + "id": "17532274_3", "ext": "mp4", "duration": 233.770, - "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱! 
- TRAP 阿婵", + "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱!-TRAP 阿婵", "uploader": "AC娘本体", "uploader_id": 23682490, }, @@ -87,8 +87,8 @@ class AcfunIE(BasicAcfunInfoExtractor): "uploader": "AC娘本体", "uploader_id": 23682490, }, - "playlist_count": 5 - } + "playlist_count": 5, + }, ] def _real_extract(self, url): @@ -99,47 +99,57 @@ class AcfunIE(BasicAcfunInfoExtractor): json_text = self._html_search_regex( r"(?s)videoInfo\s*=\s*(\{.*?\});", webpage, "json_text" ) - json_data = json.loads(json_text) + json_data = json.loads(json_text) title = json_data["title"] uploader = str_or_none(json_data.get("user").get("name")) - uploader_id = str_to_int(json_data.get("user").get("id")) + uploader_id = str_to_int(json_data.get("user").get("id")) - videoList = json_data.get('videoList') + videoList = json_data.get("videoList") if videoList: video_num = len(videoList) - + if not page_id and video_num and video_num > 1: - if not self._downloader.params.get('noplaylist'): - self.to_screen('Downloading all pages %s - add --no-playlist to just download video' % video_id) - entries = [self.url_result( - '%s_%d' % (url, pid), - self.IE_NAME, - video_id='%s_%d' % (video_id, pid)) - for pid in range(1, video_num+1)] + if not self._downloader.params.get("noplaylist"): + self.to_screen( + "Downloading all pages %s - add --no-playlist to just download video" + % video_id + ) + entries = [ + self.url_result( + "%s_%d" % (url, pid), + self.IE_NAME, + video_id="%s_%d" % (video_id, pid), + ) + for pid in range(1, video_num + 1) + ] playlist = self.playlist_result(entries, video_id, title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - }) + playlist.update( + { + "uploader": uploader, + "uploader_id": uploader_id, + } + ) return playlist - - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + self.to_screen( + "Downloading just video %s because of --no-playlist" % video_id + ) p_title = self._html_search_regex( r"(.*?)", webpage, 
"p_title", default=None, - ) + ) if p_title: - title = "%s-%s" % (title, p_title) + title = "%s-%s" % (title, p_title) if page_id: - video_id += page_id - + video_id += page_id + currentVideoInfo = json_data.get("currentVideoInfo") durationMillis = currentVideoInfo.get("durationMillis") duration = float_or_none(durationMillis) / 1000.0 From 165ff0aab2a511fdfad8ed2102e124bb3b0b2049 Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Wed, 7 Oct 2020 14:38:35 +0800 Subject: [PATCH 6/7] [acfun] Add playlist support for bangumi --- youtube_dl/extractor/acfun.py | 155 +++++++++++++++++++++++----------- 1 file changed, 105 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py index 5e97f7603..7be5db967 100644 --- a/youtube_dl/extractor/acfun.py +++ b/youtube_dl/extractor/acfun.py @@ -1,20 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import time import json import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urllib_request, -) from ..utils import ( int_or_none, float_or_none, str_or_none, str_to_int, sanitized_Request, + urlencode_postdata, ExtractorError, ) @@ -36,7 +34,8 @@ class BasicAcfunInfoExtractor(InfoExtractor): formats = [] for stream in representation: - size = float_or_none(durationMillis) * stream["avgBitrate"] / 8 + avgByterate = float_or_none(stream.get("avgBitrate"), 8) + size = float_or_none(durationMillis, invscale=avgByterate) formats += [ { "url": stream["url"], @@ -79,7 +78,7 @@ class AcfunIE(BasicAcfunInfoExtractor): }, }, { - "note": "multiple video with playlist", + "note": "multiple video within playlist", "url": "https://www.acfun.cn/v/ac17532274", "info_dict": { "id": "17532274", @@ -113,8 +112,8 @@ class AcfunIE(BasicAcfunInfoExtractor): if not page_id and video_num and video_num > 1: if not self._downloader.params.get("noplaylist"): self.to_screen( - "Downloading all pages %s - add --no-playlist to just download 
video" - % video_id + "Downloading all pages of %s(ac%s) - add --no-playlist to just download video" + % (title, video_id) ) entries = [ self.url_result( @@ -134,7 +133,8 @@ class AcfunIE(BasicAcfunInfoExtractor): return playlist self.to_screen( - "Downloading just video %s because of --no-playlist" % video_id + "Downloading just video %s(ac%s) because of --no-playlist" + % (title, video_id) ) p_title = self._html_search_regex( @@ -152,7 +152,7 @@ class AcfunIE(BasicAcfunInfoExtractor): currentVideoInfo = json_data.get("currentVideoInfo") durationMillis = currentVideoInfo.get("durationMillis") - duration = float_or_none(durationMillis) / 1000.0 + duration = float_or_none(durationMillis, 1000) formats = self._extract_formats(currentVideoInfo) return { @@ -166,44 +166,93 @@ class AcfunIE(BasicAcfunInfoExtractor): class AcfunBangumiIE(BasicAcfunInfoExtractor): - _VALID_URL = r"https?://www\.acfun\.cn/bangumi/aa(?P[_\d]+)" - _TEST = { - "url": "https://www.acfun.cn/bangumi/aa6002917_36188_1748679", - "info_dict": { - "id": "6002917_36188_1748679", - "ext": "mp4", - "duration": 1437.076, - "title": "租借女友 第12话 告白和女友", + _VALID_URL = r"https?://www\.acfun\.cn/bangumi/aa(?P\d+)(?P[_\d]+)?" 
+ _TESTS = [ + { + "note": "single episode", + "url": "https://www.acfun.cn/bangumi/aa6002917_36188_1748679", + "info_dict": { + "id": "6002917_36188_1748679", + "ext": "mp4", + "duration": 1437.076, + "title": "租借女友 第12话 告白和女友", + }, }, - } + { + "note": "all episodes of bangumi", + "url": "https://www.acfun.cn/bangumi/aa6002917", + "info_dict": { + "id": "6002917", + "title": "租借女友", + }, + "playlist_count": 12, + }, + ] + + _TEMPLATE_URL = "https://www.acfun.cn/bangumi/aa%s%s" + _FETCH_EPISODES_URL = "https://www.acfun.cn/bangumi/aa%s?pagelets=pagelet_partlist&reqID=0&ajaxpipe=1&t=%d" + + def _all_episodes(self, bangumi_id): + timestamp = int_or_none(float_or_none(time.time(), invscale=1000)) + print("Timestamp: ", timestamp) + webpage = self._download_webpage( + self._FETCH_EPISODES_URL % (bangumi_id, timestamp), + bangumi_id, + headers=self._FAKE_HEADERS, + ) + entries = [ + self.url_result(self._TEMPLATE_URL % (bangumi_id, eid), self.IE_NAME, eid) + for eid in re.findall( + r"data-href=./bangumi/aa%s([_\d]+)." 
% bangumi_id, webpage + ) + ] + return entries def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS) + bangumi_id, episode_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, bangumi_id, headers=self._FAKE_HEADERS) json_text = self._html_search_regex( r"(?s)bangumiData\s*=\s*(\{.*?\});", webpage, "json_text" ) json_data = json.loads(json_text) - title = ( - json_data.get("showTitle") - or json_data["bangumiTitle"] - + " " - + json_data["episodeName"] - + " " - + json_data["title"] + bangumiTitle = json_data["bangumiTitle"] + + if not episode_id: + if not self._downloader.params.get("noplaylist"): + self.to_screen( + "Downloading all episodes of %s(aa%s) - add --no-playlist to just download first episode" + % (bangumiTitle, bangumi_id) + ) + playlist = self.playlist_result( + self._all_episodes(bangumi_id), bangumi_id, bangumiTitle + ) + return playlist + + self.to_screen( + "Downloading just first episode %s(aa%s) because of --no-playlist" + % (bangumiTitle, bangumi_id) + ) + + title = json_data.get("showTitle") or "%s %s %s" % ( + json_data["bangumiTitle"], + json_data["episodeName"], + json_data["title"], ) currentVideoInfo = json_data.get("currentVideoInfo") durationMillis = currentVideoInfo.get("durationMillis") - duration = float_or_none(durationMillis) / 1000.0 + duration = float_or_none(durationMillis, 1000) + + if episode_id: + bangumi_id += episode_id formats = self._extract_formats(currentVideoInfo) return { - "id": video_id, + "id": bangumi_id, "title": title, - "duration": float_or_none(duration), + "duration": duration, "formats": formats, } @@ -211,23 +260,31 @@ class AcfunBangumiIE(BasicAcfunInfoExtractor): class AcfunLiveIE(BasicAcfunInfoExtractor): _VALID_URL = r"https?://live\.acfun\.cn/live/(?P\d+)" _TEST = { - "url": "https://live.acfun.cn/live/36782183", - "only_matching": True, + "url": "https://live.acfun.cn/live/34195163", 
"info_dict": { - "id": "36782183", + "id": "34195163", "ext": "mp4", - # 'title': '看见兔兔就烦!', + "title": r"re:^晴心Haruko \d{4}-\d{2}-\d{2} \d{2}:\d{2}$", "is_live": True, }, + "only_matching": True, } + _LOGIN_URL = "https://id.app.acfun.cn/rest/app/visitor/login" + _STREAMS_URL = "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%d&did=%s&acfun.api.visitor_st=%s" + def _real_extract(self, url): live_id = self._match_id(url) self._FAKE_HEADERS.update({"Referer": url}) - # Firstly get _did cookie - fisrt_req = sanitized_Request(url, headers=self._FAKE_HEADERS) - first_res = compat_urllib_request.urlopen(fisrt_req) + # Firstly fetch _did cookie and streamer name(use for title) + first_req = sanitized_Request(url, headers=self._FAKE_HEADERS) + webpage, first_res = self._download_webpage_handle(first_req, live_id) + live_up_name = self._html_search_regex( + r"]*?class[^>]*?up-name[^>]*?>([^<]*?)", + webpage, + "live_up_name", + ) for header_name, header_value in first_res.info().items(): if header_name.lower() == "set-cookie": @@ -241,25 +298,24 @@ class AcfunLiveIE(BasicAcfunInfoExtractor): self._FAKE_HEADERS.update({"Cookie": "_did=%s" % did_cookie}) # Login to get userId and acfun.api.visitor_st - login_data = compat_urllib_parse_urlencode({"sid": "acfun.api.visitor"}).encode( - "ascii" - ) + login_data = urlencode_postdata({"sid": "acfun.api.visitor"}) login_json = self._download_json( - "https://id.app.acfun.cn/rest/app/visitor/login", + self._LOGIN_URL, live_id, data=login_data, headers=self._FAKE_HEADERS, ) - streams_url = ( - "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%d&did=%s&acfun.api.visitor_st=%s" - % (login_json["userId"], did_cookie, login_json["acfun.api.visitor_st"]) + streams_url = self._STREAMS_URL % ( + login_json["userId"], + did_cookie, + login_json["acfun.api.visitor_st"], ) # Fetch stream lists - fetch_streams_data = 
compat_urllib_parse_urlencode( + fetch_streams_data = urlencode_postdata( {"authorId": int_or_none(live_id), "pullStreamType": "FLV"} - ).encode("ascii") + ) streams_json = self._download_json( streams_url, live_id, data=fetch_streams_data, headers=self._FAKE_HEADERS @@ -270,8 +326,7 @@ class AcfunLiveIE(BasicAcfunInfoExtractor): except AssertionError: raise ExtractorError("This live room is currently closed") - title = streams_json["data"]["caption"] - streams_info = json.loads(streams_json["data"]["videoPlayRes"]) # streams info + streams_info = json.loads(streams_json["data"]["videoPlayRes"]) representation = streams_info["liveAdaptiveManifest"][0]["adaptationSet"][ "representation" ] @@ -288,7 +343,7 @@ class AcfunLiveIE(BasicAcfunInfoExtractor): self._sort_formats(formats) return { "id": live_id, - "title": self._live_title(title), + "title": self._live_title(live_up_name), "formats": formats, "is_live": True, } From c21015f533b5282ac6d1cc9f521b7854792216bf Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Wed, 7 Oct 2020 18:06:38 +0800 Subject: [PATCH 7/7] [acfun] use hls instead of flv for live stream to avoid broken video --- youtube_dl/extractor/acfun.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py index 7be5db967..76c31ede0 100644 --- a/youtube_dl/extractor/acfun.py +++ b/youtube_dl/extractor/acfun.py @@ -194,7 +194,6 @@ class AcfunBangumiIE(BasicAcfunInfoExtractor): def _all_episodes(self, bangumi_id): timestamp = int_or_none(float_or_none(time.time(), invscale=1000)) - print("Timestamp: ", timestamp) webpage = self._download_webpage( self._FETCH_EPISODES_URL % (bangumi_id, timestamp), bangumi_id, @@ -333,9 +332,15 @@ class AcfunLiveIE(BasicAcfunInfoExtractor): formats = [] for stream in representation: + # Use HLS instead of FLV to avoid ending up with a broken video when stopped + i = stream["url"].index("flv?") + u3m8_url = ( +
stream["url"][0:i].replace("pull.etoote.com", "hlspull.etoote.com") + + "m3u8" + ) formats += [ { - "url": stream["url"], + "url": u3m8_url, "ext": "mp4", "tbr": stream.get("bitrate"), }