diff --git a/youtube_dl/extractor/acfun.py b/youtube_dl/extractor/acfun.py
new file mode 100644
index 000000000..76c31ede0
--- /dev/null
+++ b/youtube_dl/extractor/acfun.py
@@ -0,0 +1,354 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import time
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    str_or_none,
+    str_to_int,
+    sanitized_Request,
+    urlencode_postdata,
+    ExtractorError,
+)
+
+
+class BasicAcfunInfoExtractor(InfoExtractor):
+    _FAKE_HEADERS = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",  # noqa
+        "Accept-Charset": "UTF-8,*;q=0.5",
+        "Accept-Encoding": "gzip,deflate,sdch",
+        "Accept-Language": "en-US,en;q=0.8",
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0",  # noqa
+    }
+
+    def _extract_formats(self, currentVideoInfo):
+        durationMillis = currentVideoInfo.get("durationMillis")
+        formats = []
+        if "ksPlayJson" in currentVideoInfo:
+            ksPlayJson = json.loads(currentVideoInfo["ksPlayJson"])
+            representation = ksPlayJson.get("adaptationSet")[0].get("representation")
+
+            for stream in representation:
+                # avgBitrate appears to be in kbit/s: dividing by 8 gives bytes per
+                # millisecond, so scaling by durationMillis approximates the filesize
+                avgByterate = float_or_none(stream.get("avgBitrate"), 8)
+                size = float_or_none(durationMillis, invscale=avgByterate)
+                formats += [
+                    {
+                        "url": stream["url"],
+                        "ext": "mp4",
+                        "width": stream.get("width"),
+                        "height": stream.get("height"),
+                        "filesize": size,
+                    }
+                ]
+            formats = formats[::-1]
+        self._sort_formats(formats)
+        return formats
+
+
+class AcfunIE(BasicAcfunInfoExtractor):
+    _VALID_URL = r"https?://www\.acfun\.cn/v/ac(?P<id>\d+)(?P<page_id>[_\d]+)?"
+    _TESTS = [
+        {
+            "note": "single video without playlist",
+            "url": "https://www.acfun.cn/v/ac18184362",
+            "info_dict": {
+                "id": "18184362",
+                "ext": "mp4",
+                "duration": 192.042,
+                "title": "【AC娘】魔性新单《极乐857》上线!来和AC娘一起云蹦迪吧!",
+                "uploader": "AC娘本体",
+                "uploader_id": 23682490,
+            },
+        },
+        {
+            "note": "single video in playlist",
+            "url": "https://www.acfun.cn/v/ac17532274_3",
+            "info_dict": {
+                "id": "17532274_3",
+                "ext": "mp4",
+                "duration": 233.770,
+                "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱!-TRAP 阿婵",
+                "uploader": "AC娘本体",
+                "uploader_id": 23682490,
+            },
+        },
+        {
+            "note": "multiple videos within playlist",
+            "url": "https://www.acfun.cn/v/ac17532274",
+            "info_dict": {
+                "id": "17532274",
+                "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP:七夕恋歌!落入本娘爱的陷阱!",
+                "uploader": "AC娘本体",
+                "uploader_id": 23682490,
+            },
+            "playlist_count": 5,
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id, page_id = re.match(self._VALID_URL, url).groups()
+
+        webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS)
+
+        json_text = self._html_search_regex(
+            r"(?s)videoInfo\s*=\s*(\{.*?\});", webpage, "json_text"
+        )
+        json_data = json.loads(json_text)
+
+        title = json_data["title"]
+
+        uploader = str_or_none(json_data.get("user").get("name"))
+        uploader_id = str_to_int(json_data.get("user").get("id"))
+
+        videoList = json_data.get("videoList")
+        video_num = len(videoList) if videoList else None
+
+        if not page_id and video_num and video_num > 1:
+            if not self._downloader.params.get("noplaylist"):
+                self.to_screen(
+                    "Downloading all pages of %s(ac%s) - add --no-playlist to just download video"
+                    % (title, video_id)
+                )
+                entries = [
+                    self.url_result(
+                        "%s_%d" % (url, pid),
+                        self.IE_NAME,
+                        video_id="%s_%d" % (video_id, pid),
+                    )
+                    for pid in range(1, video_num + 1)
+                ]
+                playlist = self.playlist_result(entries, video_id, title)
+                playlist.update(
+                    {
+                        "uploader": uploader,
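+                        # playlist_result() only sets id/title/entries, so carry the
+                        # uploader metadata over to the playlist dict explicitly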
"uploader_id": uploader_id, + } + ) + return playlist + + self.to_screen( + "Downloading just video %s(ac%s) because of --no-playlist" + % (title, video_id) + ) + + p_title = self._html_search_regex( + r"(.*?)", + webpage, + "p_title", + default=None, + ) + + if p_title: + title = "%s-%s" % (title, p_title) + + if page_id: + video_id += page_id + + currentVideoInfo = json_data.get("currentVideoInfo") + durationMillis = currentVideoInfo.get("durationMillis") + duration = float_or_none(durationMillis, 1000) + + formats = self._extract_formats(currentVideoInfo) + return { + "id": video_id, + "uploader_id": uploader_id, + "title": title, + "uploader": uploader, + "duration": duration, + "formats": formats, + } + + +class AcfunBangumiIE(BasicAcfunInfoExtractor): + _VALID_URL = r"https?://www\.acfun\.cn/bangumi/aa(?P\d+)(?P[_\d]+)?" + _TESTS = [ + { + "note": "single episode", + "url": "https://www.acfun.cn/bangumi/aa6002917_36188_1748679", + "info_dict": { + "id": "6002917_36188_1748679", + "ext": "mp4", + "duration": 1437.076, + "title": "租借女友 第12话 告白和女友", + }, + }, + { + "note": "all episodes of bangumi", + "url": "https://www.acfun.cn/bangumi/aa6002917", + "info_dict": { + "id": "6002917", + "title": "租借女友", + }, + "playlist_count": 12, + }, + ] + + _TEMPLATE_URL = "https://www.acfun.cn/bangumi/aa%s%s" + _FETCH_EPISODES_URL = "https://www.acfun.cn/bangumi/aa%s?pagelets=pagelet_partlist&reqID=0&ajaxpipe=1&t=%d" + + def _all_episodes(self, bangumi_id): + timestamp = int_or_none(float_or_none(time.time(), invscale=1000)) + webpage = self._download_webpage( + self._FETCH_EPISODES_URL % (bangumi_id, timestamp), + bangumi_id, + headers=self._FAKE_HEADERS, + ) + entries = [ + self.url_result(self._TEMPLATE_URL % (bangumi_id, eid), self.IE_NAME, eid) + for eid in re.findall( + r"data-href=./bangumi/aa%s([_\d]+)." 
+                r"data-href=./bangumi/aa%s([_\d]+)." % bangumi_id, webpage
+            )
+        ]
+        return entries
+
+    def _real_extract(self, url):
+        bangumi_id, episode_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, bangumi_id, headers=self._FAKE_HEADERS)
+
+        json_text = self._html_search_regex(
+            r"(?s)bangumiData\s*=\s*(\{.*?\});", webpage, "json_text"
+        )
+        json_data = json.loads(json_text)
+
+        bangumiTitle = json_data["bangumiTitle"]
+
+        if not episode_id:
+            if not self._downloader.params.get("noplaylist"):
+                self.to_screen(
+                    "Downloading all episodes of %s(aa%s) - add --no-playlist to just download first episode"
+                    % (bangumiTitle, bangumi_id)
+                )
+                playlist = self.playlist_result(
+                    self._all_episodes(bangumi_id), bangumi_id, bangumiTitle
+                )
+                return playlist
+
+            self.to_screen(
+                "Downloading just first episode %s(aa%s) because of --no-playlist"
+                % (bangumiTitle, bangumi_id)
+            )
+
+        title = json_data.get("showTitle") or "%s %s %s" % (
+            json_data["bangumiTitle"],
+            json_data["episodeName"],
+            json_data["title"],
+        )
+
+        currentVideoInfo = json_data.get("currentVideoInfo")
+        durationMillis = currentVideoInfo.get("durationMillis")
+        duration = float_or_none(durationMillis, 1000)
+
+        if episode_id:
+            bangumi_id += episode_id
+
+        formats = self._extract_formats(currentVideoInfo)
+        return {
+            "id": bangumi_id,
+            "title": title,
+            "duration": duration,
+            "formats": formats,
+        }
+
+
+class AcfunLiveIE(BasicAcfunInfoExtractor):
+    _VALID_URL = r"https?://live\.acfun\.cn/live/(?P<id>\d+)"
+    _TEST = {
+        "url": "https://live.acfun.cn/live/34195163",
+        "info_dict": {
+            "id": "34195163",
+            "ext": "mp4",
+            "title": r"re:^晴心Haruko \d{4}-\d{2}-\d{2} \d{2}:\d{2}$",
+            "is_live": True,
+        },
+        "only_matching": True,
+    }
+
+    _LOGIN_URL = "https://id.app.acfun.cn/rest/app/visitor/login"
+    _STREAMS_URL = "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%d&did=%s&acfun.api.visitor_st=%s"
+
+    def _real_extract(self, url):
+        live_id = self._match_id(url)
+        self._FAKE_HEADERS.update({"Referer": url})
+
+        # First fetch the _did cookie and the streamer name (used for the title)
+        first_req = sanitized_Request(url, headers=self._FAKE_HEADERS)
+        webpage, first_res = self._download_webpage_handle(first_req, live_id)
+        live_up_name = self._html_search_regex(
+            r"<[^>]*?class[^>]*?up-name[^>]*?>([^<]*?)<",
+            webpage,
+            "live_up_name",
+        )
+
+        cookies = None
+        for header_name, header_value in first_res.info().items():
+            if header_name.lower() == "set-cookie":
+                cookies = header_value
+        if not cookies:
+            raise ExtractorError("Failed to fetch cookies")
+
+        cookies_dict = dict(c.strip(" ,").split("=", 1) for c in cookies.split(";"))
+        did_cookie = cookies_dict["_did"]
+
+        self._FAKE_HEADERS.update({"Cookie": "_did=%s" % did_cookie})
+
+        # Login as a visitor to get userId and acfun.api.visitor_st
+        login_data = urlencode_postdata({"sid": "acfun.api.visitor"})
+        login_json = self._download_json(
+            self._LOGIN_URL,
+            live_id,
+            data=login_data,
+            headers=self._FAKE_HEADERS,
+        )
+
+        streams_url = self._STREAMS_URL % (
+            login_json["userId"],
+            did_cookie,
+            login_json["acfun.api.visitor_st"],
+        )
+
+        # Fetch the stream list
+        fetch_streams_data = urlencode_postdata(
+            {"authorId": int_or_none(live_id), "pullStreamType": "FLV"}
+        )
+
+        streams_json = self._download_json(
+            streams_url, live_id, data=fetch_streams_data, headers=self._FAKE_HEADERS
+        )
+
+        if "data" not in streams_json:
+            raise ExtractorError("This live room is currently closed")
+
+        streams_info = json.loads(streams_json["data"]["videoPlayRes"])
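+        # videoPlayRes is itself a JSON string; its liveAdaptiveManifest holds an
+        # adaptationSet with one representation per available stream quality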
+        representation = streams_info["liveAdaptiveManifest"][0]["adaptationSet"][
+            "representation"
+        ]
+
+        formats = []
+        for stream in representation:
+            # Use HLS instead of FLV to avoid broken video when the stream is stopped:
+            # swap the FLV pull host for the HLS one and request the .m3u8 manifest
+            i = stream["url"].index("flv?")
+            m3u8_url = (
+                stream["url"][0:i].replace("pull.etoote.com", "hlspull.etoote.com")
+                + "m3u8"
+            )
+            formats += [
+                {
+                    "url": m3u8_url,
+                    "ext": "mp4",
+                    "tbr": stream.get("bitrate"),
+                }
+            ]
+        self._sort_formats(formats)
+        return {
+            "id": live_id,
+            "title": self._live_title(live_up_name),
+            "formats": formats,
+            "is_live": True,
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index ae7079a6a..7023ca3d9 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -18,6 +18,11 @@ from .acast import (
     ACastIE,
     ACastChannelIE,
 )
+from .acfun import (
+    AcfunIE,
+    AcfunBangumiIE,
+    AcfunLiveIE,
+)
 from .adn import ADNIE
 from .adobeconnect import AdobeConnectIE
 from .adobetv import (