1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-12-03 05:47:55 +01:00
youtube-dl/youtube_dl/extractor/acfun.py

295 lines
9.7 KiB
Python
Raw Normal View History

2020-10-05 06:38:25 +02:00
# coding: utf-8
from __future__ import unicode_literals
import json
import re
2020-10-05 06:38:25 +02:00
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlencode,
compat_urllib_request,
)
from ..utils import (
int_or_none,
float_or_none,
str_or_none,
str_to_int,
sanitized_Request,
ExtractorError,
)
2020-10-05 12:14:42 +02:00
2020-10-05 06:38:25 +02:00
class BasicAcfunInfoExtractor(InfoExtractor):
    """Common base for the AcFun extractors.

    Holds the browser-like request headers used by the subclasses and the
    helper that turns a ``currentVideoInfo`` object into a formats list.
    """

    # Browser-like headers that subclasses pass along with their requests.
    _FAKE_HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",  # noqa
        "Accept-Charset": "UTF-8,*;q=0.5",
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "en-US,en;q=0.8",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0",  # noqa
    }

    def _extract_formats(self, currentVideoInfo):
        """Build the formats list from a ``currentVideoInfo`` dict.

        ``currentVideoInfo["ksPlayJson"]`` is a JSON string; the streams are
        read from the ``representation`` list of its first ``adaptationSet``
        entry.

        Returns the list of format dicts after passing it through
        ``self._sort_formats``.  When no stream description is present the
        (empty) list is still handed to ``_sort_formats`` instead of failing
        on an unbound local.
        """
        representation = []
        ks_play_json = currentVideoInfo.get("ksPlayJson")
        if ks_play_json:
            adaptation_sets = json.loads(ks_play_json).get("adaptationSet") or []
            if adaptation_sets:
                representation = adaptation_sets[0].get("representation") or []

        duration_ms = float_or_none(currentVideoInfo.get("durationMillis"))

        formats = []
        for stream in representation:
            # Size is only an estimate: duration * average bitrate / 8.
            # NOTE(review): assumes avgBitrate is in kbit/s so that
            # milliseconds * bitrate yields bits -- confirm against the API.
            avg_bitrate = float_or_none(stream.get("avgBitrate"))
            filesize = None
            if duration_ms is not None and avg_bitrate is not None:
                filesize = duration_ms * avg_bitrate / 8

            formats.append(
                {
                    "url": stream["url"],
                    "ext": "mp4",
                    "width": stream.get("width"),
                    "height": stream.get("height"),
                    "filesize": filesize,
                }
            )

        # Keep the original pre-sort ordering (reverse of the API order).
        formats.reverse()

        self._sort_formats(formats)
        return formats
2020-10-05 06:38:25 +02:00
class AcfunIE(BasicAcfunInfoExtractor):
    """Extractor for AcFun video pages (``/v/ac<id>`` and ``/v/ac<id>_<page>``).

    A URL without a page suffix that refers to a multi-part video is returned
    as a playlist of its parts unless ``noplaylist`` is set.
    """

    _VALID_URL = r"https?://www\.acfun\.cn/v/ac(?P<id>\d+)(?P<page_id>[_\d]+)?"

    _TESTS = [
        {
            "note": "single video without playlist",
            "url": "https://www.acfun.cn/v/ac18184362",
            "info_dict": {
                "id": "18184362",
                "ext": "mp4",
                "duration": 192.042,
                "title": "【AC娘】魔性新单《极乐857》上线来和AC娘一起云蹦迪吧",
                "uploader": "AC娘本体",
                "uploader_id": 23682490,
            },
        },
        {
            "note": "single video in playlist",
            "url": "https://www.acfun.cn/v/ac17532274_3",
            "info_dict": {
                "id": "17532274_3",
                "ext": "mp4",
                "duration": 233.770,
                "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP七夕恋歌落入本娘爱的陷阱-TRAP 阿婵",
                "uploader": "AC娘本体",
                "uploader_id": 23682490,
            },
        },
        {
            "note": "multiple video with playlist",
            "url": "https://www.acfun.cn/v/ac17532274",
            "info_dict": {
                "id": "17532274",
                "title": "【AC娘x竾颜音】【周六狂欢24小时】TRAP七夕恋歌落入本娘爱的陷阱",
                "uploader": "AC娘本体",
                "uploader_id": 23682490,
            },
            "playlist_count": 5,
        },
    ]

    def _real_extract(self, url):
        """Return the info dict (or playlist) for an AcFun video URL.

        Raises ExtractorError when the page carries no ``currentVideoInfo``.
        """
        video_id, page_id = re.match(self._VALID_URL, url).groups()

        webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS)

        json_text = self._html_search_regex(
            r"(?s)videoInfo\s*=\s*(\{.*?\});", webpage, "json_text"
        )
        json_data = json.loads(json_text)

        title = json_data["title"]
        # "user" may be absent; fall back to an empty dict rather than
        # calling .get() on None.
        user = json_data.get("user") or {}
        uploader = str_or_none(user.get("name"))
        uploader_id = str_to_int(user.get("id"))

        # Always defined, even when "videoList" is missing or empty.
        video_num = len(json_data.get("videoList") or [])

        if not page_id and video_num > 1:
            if not self._downloader.params.get("noplaylist"):
                self.to_screen(
                    "Downloading all pages %s - add --no-playlist to just download video"
                    % video_id
                )
                # Build part URLs from the numeric id, not from the raw input
                # URL: a trailing query string or fragment would otherwise
                # end up before the "_<n>" suffix and the entry would match
                # as this same playlist again.
                entries = [
                    self.url_result(
                        "https://www.acfun.cn/v/ac%s_%d" % (video_id, pid),
                        self.ie_key(),
                        video_id="%s_%d" % (video_id, pid),
                    )
                    for pid in range(1, video_num + 1)
                ]
                playlist = self.playlist_result(entries, video_id, title)
                playlist.update(
                    {
                        "uploader": uploader,
                        "uploader_id": uploader_id,
                    }
                )
                return playlist

            self.to_screen(
                "Downloading just video %s because of --no-playlist" % video_id
            )

        # Title of the currently selected part, when the page lists parts.
        p_title = self._html_search_regex(
            r"<li\s[^<]*?class='[^']*active[^']*'.*?>(.*?)</li>",
            webpage,
            "p_title",
            default=None,
        )
        if p_title:
            title = "%s-%s" % (title, p_title)

        if page_id:
            # page_id already carries its leading underscore (e.g. "_3").
            video_id += page_id

        currentVideoInfo = json_data.get("currentVideoInfo")
        if not currentVideoInfo:
            raise ExtractorError("Unable to find currentVideoInfo in page data")

        # scale=1000 converts milliseconds to seconds and yields None
        # (instead of a TypeError) when the duration is missing.
        duration = float_or_none(currentVideoInfo.get("durationMillis"), 1000)

        formats = self._extract_formats(currentVideoInfo)

        return {
            "id": video_id,
            "uploader_id": uploader_id,
            "title": title,
            "uploader": uploader,
            "duration": duration,
            "formats": formats,
        }
class AcfunBangumiIE(BasicAcfunInfoExtractor):
    """Extractor for AcFun bangumi episode pages (``/bangumi/aa<id>``)."""

    _VALID_URL = r"https?://www\.acfun\.cn/bangumi/aa(?P<id>[_\d]+)"

    _TEST = {
        "url": "https://www.acfun.cn/bangumi/aa6002917_36188_1748679",
        "info_dict": {
            "id": "6002917_36188_1748679",
            "ext": "mp4",
            "duration": 1437.076,
            "title": "租借女友 第12话 告白和女友",
        },
    }

    def _real_extract(self, url):
        """Return the info dict for an AcFun bangumi episode URL.

        Raises ExtractorError when the page carries no ``currentVideoInfo``.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id, headers=self._FAKE_HEADERS)

        json_text = self._html_search_regex(
            r"(?s)bangumiData\s*=\s*(\{.*?\});", webpage, "json_text"
        )
        json_data = json.loads(json_text)

        # Prefer the ready-made display title; otherwise assemble it from
        # its three parts.
        title = json_data.get("showTitle") or " ".join(
            (
                json_data["bangumiTitle"],
                json_data["episodeName"],
                json_data["title"],
            )
        )

        currentVideoInfo = json_data.get("currentVideoInfo")
        if not currentVideoInfo:
            raise ExtractorError("Unable to find currentVideoInfo in page data")

        # scale=1000 converts milliseconds to seconds and yields None
        # (instead of a TypeError) when the duration is missing.
        duration = float_or_none(currentVideoInfo.get("durationMillis"), 1000)

        formats = self._extract_formats(currentVideoInfo)

        return {
            "id": video_id,
            "title": title,
            "duration": duration,
            "formats": formats,
        }
2020-10-05 12:14:42 +02:00
2020-10-05 06:38:25 +02:00
class AcfunLiveIE(BasicAcfunInfoExtractor):
    """Extractor for AcFun live rooms (``live.acfun.cn/live/<id>``)."""

    _VALID_URL = r"https?://live\.acfun\.cn/live/(?P<id>\d+)"

    _TEST = {
        "url": "https://live.acfun.cn/live/36782183",
        "only_matching": True,
        "info_dict": {
            "id": "36782183",
            "ext": "mp4",
            # 'title': '看见兔兔就烦!',
            "is_live": True,
        },
    }

    @staticmethod
    def _find_did_cookie(header_items):
        """Return the ``_did`` cookie value from response headers, or None.

        ``header_items`` is an iterable of ``(name, value)`` header pairs.
        Every Set-Cookie header is inspected, and the value is located with a
        regex so that value-less attributes (e.g. ``HttpOnly``) or several
        cookies folded into one header line do not break parsing.
        """
        for header_name, header_value in header_items:
            if header_name.lower() != "set-cookie":
                continue
            mobj = re.search(r"(?:^|[;,\s])_did=([^;,\s]+)", header_value)
            if mobj:
                return mobj.group(1)
        return None

    def _real_extract(self, url):
        """Return the info dict for an AcFun live room URL.

        Raises ExtractorError when the ``_did`` cookie cannot be obtained or
        when the API response carries no stream data.
        """
        live_id = self._match_id(url)

        # Work on a per-call copy: _FAKE_HEADERS is a class attribute shared
        # with the other AcFun extractors, so updating it in place would leak
        # this room's Referer and Cookie into later requests.
        headers = dict(self._FAKE_HEADERS)
        headers["Referer"] = url

        # Firstly get _did cookie
        first_req = sanitized_Request(url, headers=headers)
        first_res = compat_urllib_request.urlopen(first_req)
        try:
            did_cookie = self._find_did_cookie(first_res.info().items())
        finally:
            # Only the headers are needed; release the connection.
            first_res.close()
        if not did_cookie:
            raise ExtractorError("Fail to fetch cookies")

        headers["Cookie"] = "_did=%s" % did_cookie

        # Login to get userId and acfun.api.visitor_st
        login_data = compat_urllib_parse_urlencode({"sid": "acfun.api.visitor"}).encode(
            "ascii"
        )
        login_json = self._download_json(
            "https://id.app.acfun.cn/rest/app/visitor/login",
            live_id,
            data=login_data,
            headers=headers,
        )

        # %s (not %d) for userId so a string-typed id does not raise.
        streams_url = (
            "https://api.kuaishouzt.com/rest/zt/live/web/startPlay?subBiz=mainApp&kpn=ACFUN_APP&kpf=PC_WEB&userId=%s&did=%s&acfun.api.visitor_st=%s"
            % (login_json["userId"], did_cookie, login_json["acfun.api.visitor_st"])
        )

        # Fetch stream lists
        fetch_streams_data = compat_urllib_parse_urlencode(
            {"authorId": int_or_none(live_id), "pullStreamType": "FLV"}
        ).encode("ascii")
        streams_json = self._download_json(
            streams_url, live_id, data=fetch_streams_data, headers=headers
        )

        # Explicit check instead of assert, which is stripped under -O.
        if "data" not in streams_json:
            raise ExtractorError("This live room is currently closed", expected=True)

        title = streams_json["data"]["caption"]
        # videoPlayRes is itself a JSON string describing the streams.
        streams_info = json.loads(streams_json["data"]["videoPlayRes"])
        representation = streams_info["liveAdaptiveManifest"][0]["adaptationSet"][
            "representation"
        ]

        formats = [
            {
                "url": stream["url"],
                "ext": "mp4",
                "tbr": stream.get("bitrate"),
            }
            for stream in representation
        ]
        self._sort_formats(formats)

        return {
            "id": live_id,
            "title": self._live_title(title),
            "formats": formats,
            "is_live": True,
        }