2016-10-02 13:39:18 +02:00
# coding: utf-8
2014-03-27 21:01:09 +01:00
from __future__ import unicode_literals
2013-11-23 10:19:44 +01:00
2014-10-29 11:04:48 +01:00
import json
2015-04-29 20:24:05 +02:00
import datetime
2018-09-08 22:46:40 +02:00
import re
2013-11-23 10:19:44 +01:00
2018-09-08 22:46:40 +02:00
from . common import InfoExtractor , SearchInfoExtractor
2014-12-13 12:24:42 +01:00
from . . compat import (
2017-06-07 08:47:25 +02:00
compat_parse_qs ,
2013-11-23 10:19:44 +01:00
compat_urlparse ,
2014-12-13 12:24:42 +01:00
)
from . . utils import (
2017-06-07 08:47:25 +02:00
determine_ext ,
2017-08-06 17:07:06 +02:00
dict_get ,
2014-11-05 19:52:34 +01:00
ExtractorError ,
2014-12-13 12:24:42 +01:00
int_or_none ,
2017-08-06 18:19:46 +02:00
float_or_none ,
2014-12-13 12:24:42 +01:00
parse_duration ,
2015-04-29 18:24:24 +02:00
parse_iso8601 ,
2017-08-06 18:19:46 +02:00
remove_start ,
2017-08-06 17:07:06 +02:00
try_get ,
unified_timestamp ,
2016-03-25 21:19:24 +01:00
urlencode_postdata ,
2017-06-07 08:47:25 +02:00
xpath_text ,
2013-11-23 10:19:44 +01:00
)
2013-11-24 06:37:14 +01:00
2013-11-23 10:19:44 +01:00
class NiconicoIE(InfoExtractor):
    IE_NAME = 'niconico'
    IE_DESC = 'ニコニコ動画'

    _TESTS = [{
        'url': 'http://www.nicovideo.jp/watch/sm22312215',
        'md5': 'd1a75c0823e2f629128c43e1212760f9',
        'info_dict': {
            'id': 'sm22312215',
            'ext': 'mp4',
            'title': 'Big Buck Bunny',
            'thumbnail': r're:https?://.*',
            'uploader': 'takuya0301',
            'uploader_id': '2698420',
            'upload_date': '20131123',
            'timestamp': int,  # timestamp is unstable
            'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
            'duration': 33,
            'view_count': int,
            'comment_count': int,
        },
        'skip': 'Requires an account',
    }, {
        # File downloaded with and without credentials are different, so omit
        # the md5 field
        'url': 'http://www.nicovideo.jp/watch/nm14296458',
        'info_dict': {
            'id': 'nm14296458',
            'ext': 'swf',
            'title': '【鏡音リン】Dance on media【オリジナル】take2!',
            'description': 'md5:689f066d74610b3b22e0f1739add0f58',
            'thumbnail': r're:https?://.*',
            'uploader': 'りょうた',
            'uploader_id': '18822557',
            'upload_date': '20110429',
            'timestamp': 1304065916,
            'duration': 209,
        },
        'skip': 'Requires an account',
    }, {
        # 'video exists but is marked as "deleted"
        # md5 is unstable
        'url': 'http://www.nicovideo.jp/watch/sm10000',
        'info_dict': {
            'id': 'sm10000',
            'ext': 'unknown_video',
            'description': 'deleted',
            'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
            'thumbnail': r're:https?://.*',
            'upload_date': '20071224',
            'timestamp': int,  # timestamp field has different value if logged in
            'duration': 304,
            'view_count': int,
        },
        'skip': 'Requires an account',
    }, {
        'url': 'http://www.nicovideo.jp/watch/so22543406',
        'info_dict': {
            'id': '1388129933',
            'ext': 'mp4',
            'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
            'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
            'thumbnail': r're:https?://.*',
            'timestamp': 1388851200,
            'upload_date': '20140104',
            'uploader': 'アニメロチャンネル',
            'uploader_id': '312',
        },
        'skip': 'The viewing period of the video you were searching for has expired.',
    }, {
        # video not available via `getflv`; "old" HTML5 video
        'url': 'http://www.nicovideo.jp/watch/sm1151009',
        'md5': '8fa81c364eb619d4085354eab075598a',
        'info_dict': {
            'id': 'sm1151009',
            'ext': 'mp4',
            'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)',
            'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7',
            'thumbnail': r're:https?://.*',
            'duration': 184,
            'timestamp': 1190868283,
            'upload_date': '20070927',
            'uploader': 'denden2',
            'uploader_id': '1392194',
            'view_count': int,
            'comment_count': int,
        },
        'skip': 'Requires an account',
    }, {
        # "New" HTML5 video
        # md5 is unstable
        'url': 'http://www.nicovideo.jp/watch/sm31464864',
        'info_dict': {
            'id': 'sm31464864',
            'ext': 'mp4',
            'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質',
            'description': 'md5:e52974af9a96e739196b2c1ca72b5feb',
            'timestamp': 1498514060,
            'upload_date': '20170626',
            'uploader': 'ゲスト',
            'uploader_id': '40826363',
            'thumbnail': r're:https?://.*',
            'duration': 198,
            'view_count': int,
            'comment_count': int,
        },
        'skip': 'Requires an account',
    }, {
        # Video without owner
        'url': 'http://www.nicovideo.jp/watch/sm18238488',
        'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e',
        'info_dict': {
            'id': 'sm18238488',
            'ext': 'mp4',
            'title': '【実写版】ミュータントタートルズ',
            'description': 'md5:15df8988e47a86f9e978af2064bf6d8e',
            'timestamp': 1341160408,
            'upload_date': '20120701',
            'uploader': None,
            'uploader_id': None,
            'thumbnail': r're:https?://.*',
            'duration': 5271,
            'view_count': int,
            'comment_count': int,
        },
        'skip': 'Requires an account',
    }, {
        'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
        'only_matching': True,
    }]

    _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
    _NETRC_MACHINE = 'niconico'

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in with the configured credentials, if any.

        Returns True when no login was needed or it succeeded, False on a
        failed attempt (a warning is reported in that case).
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if not username:
            return True

        # Log in
        login_ok = True
        login_form_strs = {
            'mail_tel': username,
            'password': password,
        }
        urlh = self._request_webpage(
            'https://account.nicovideo.jp/api/v1/login', None,
            note='Logging in', errnote='Unable to log in',
            data=urlencode_postdata(login_form_strs))
        if urlh is False:
            login_ok = False
        else:
            # A failed login redirects to a URL carrying message=cant_login
            # in its query string.
            parts = compat_urlparse.urlparse(urlh.geturl())
            if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
                login_ok = False
        if not login_ok:
            self._downloader.report_warning('unable to log in: bad username or password')
        return login_ok

    def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
        """Negotiate a streaming session for one audio/video quality pair.

        Posts a session request to the session API endpoint advertised in
        api_data['video']['dmcInfo']['session_api'] and returns a format dict
        whose URL points at the negotiated content.
        """
        def yesno(boolean):
            return 'yes' if boolean else 'no'

        session_api_data = api_data['video']['dmcInfo']['session_api']
        session_api_endpoint = session_api_data['urls'][0]

        # e.g. 'h264_360p-aac_128kbps' (the 'archive_' prefix is dropped)
        format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))

        session_response = self._download_json(
            session_api_endpoint['url'], video_id,
            query={'_format': 'json'},
            headers={'Content-Type': 'application/json'},
            note='Downloading JSON metadata for %s' % format_id,
            data=json.dumps({
                'session': {
                    'client_info': {
                        'player_id': session_api_data['player_id'],
                    },
                    'content_auth': {
                        'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
                        'content_key_timeout': session_api_data['content_key_timeout'],
                        'service_id': 'nicovideo',
                        'service_user_id': session_api_data['service_user_id']
                    },
                    'content_id': session_api_data['content_id'],
                    'content_src_id_sets': [{
                        'content_src_ids': [{
                            'src_id_to_mux': {
                                'audio_src_ids': [audio_quality['id']],
                                'video_src_ids': [video_quality['id']],
                            }
                        }]
                    }],
                    'content_type': 'movie',
                    'content_uri': '',
                    'keep_method': {
                        'heartbeat': {
                            'lifetime': session_api_data['heartbeat_lifetime']
                        }
                    },
                    'priority': session_api_data['priority'],
                    'protocol': {
                        'name': 'http',
                        'parameters': {
                            'http_parameters': {
                                'parameters': {
                                    'http_output_download_parameters': {
                                        'use_ssl': yesno(session_api_endpoint['is_ssl']),
                                        'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
                                    }
                                }
                            }
                        }
                    },
                    'recipe_id': session_api_data['recipe_id'],
                    'session_operation_auth': {
                        'session_operation_auth_by_signature': {
                            'signature': session_api_data['signature'],
                            'token': session_api_data['token'],
                        }
                    },
                    'timing_constraint': 'unlimited'
                }
            }).encode())

        resolution = video_quality.get('resolution', {})

        return {
            'url': session_response['data']['session']['content_uri'],
            'format_id': format_id,
            'ext': 'mp4',  # Session API are used in HTML5, which always serves mp4
            'abr': float_or_none(audio_quality.get('bitrate'), 1000),
            'vbr': float_or_none(video_quality.get('bitrate'), 1000),
            'height': resolution.get('height'),
            'width': resolution.get('width'),
        }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # Get video webpage. We are not actually interested in it for normal
        # cases, but need the cookies in order to be able to download the
        # info webpage
        webpage, handle = self._download_webpage_handle(
            'http://www.nicovideo.jp/watch/' + video_id, video_id)
        if video_id.startswith('so'):
            # Channel ('so') URLs redirect to a numeric watch id
            video_id = self._match_id(handle.geturl())

        api_data = self._parse_json(self._html_search_regex(
            'data-api-data="([^"]+)"', webpage,
            'API data', default='{}'), video_id)

        def _format_id_from_url(video_url):
            # URLs ending in 'low' serve the economy (lower-quality) variant.
            # BUGFIX: this previously tested the enclosing `video_real_url`
            # instead of its own argument; it only worked because every call
            # site happened to pass `video_real_url`.
            return 'economy' if video_url.endswith('low') else 'normal'

        try:
            video_real_url = api_data['video']['smileInfo']['url']
        except KeyError:  # Flash videos
            # Get flv info
            flv_info_webpage = self._download_webpage(
                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
                video_id, 'Downloading flv info')

            flv_info = compat_urlparse.parse_qs(flv_info_webpage)

            if 'url' not in flv_info:
                if 'deleted' in flv_info:
                    raise ExtractorError('The video has been deleted.',
                                         expected=True)
                elif 'closed' in flv_info:
                    raise ExtractorError('Niconico videos now require logging in',
                                         expected=True)
                elif 'error' in flv_info:
                    raise ExtractorError('%s reports error: %s' % (
                        self.IE_NAME, flv_info['error'][0]), expected=True)
                else:
                    raise ExtractorError('Unable to find video URL')

            video_info_xml = self._download_xml(
                'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
                video_id, note='Downloading video info page')

            def get_video_info(items):
                # Return the first non-empty value among the given XML tag
                # name(s) from the getthumbinfo response.
                if not isinstance(items, list):
                    items = [items]
                for item in items:
                    ret = xpath_text(video_info_xml, './/' + item)
                    if ret:
                        return ret

            video_real_url = flv_info['url'][0]

            extension = get_video_info('movie_type')
            if not extension:
                extension = determine_ext(video_real_url)

            formats = [{
                'url': video_real_url,
                'ext': extension,
                'format_id': _format_id_from_url(video_real_url),
            }]
        else:
            formats = []

            dmc_info = api_data['video'].get('dmcInfo')
            if dmc_info:  # "New" HTML5 videos
                quality_info = dmc_info['quality']
                for audio_quality in quality_info['audios']:
                    for video_quality in quality_info['videos']:
                        if not audio_quality['available'] or not video_quality['available']:
                            continue
                        formats.append(self._extract_format_for_quality(
                            api_data, video_id, audio_quality, video_quality))

                self._sort_formats(formats)
            else:  # "Old" HTML5 videos
                formats = [{
                    'url': video_real_url,
                    'ext': 'mp4',
                    'format_id': _format_id_from_url(video_real_url),
                }]

            def get_video_info(items):
                # In the HTML5 path metadata comes from the embedded API data.
                return dict_get(api_data['video'], items)

        # Start extracting information
        title = get_video_info('title')
        if not title:
            title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
                webpage, 'video title')

        watch_api_data_string = self._html_search_regex(
            r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
            webpage, 'watch api data', default=None)
        watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
        video_detail = watch_api_data.get('videoDetail', {})

        thumbnail = (
            get_video_info(['thumbnail_url', 'thumbnailURL']) or
            self._html_search_meta('image', webpage, 'thumbnail', default=None) or
            video_detail.get('thumbnail'))

        description = get_video_info('description')

        timestamp = (parse_iso8601(get_video_info('first_retrieve')) or
                     unified_timestamp(get_video_info('postedDateTime')))
        if not timestamp:
            match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
            if match:
                # datePublished lacks seconds before the timezone offset,
                # e.g. 2016-01-01T00:00+09:00 -> 2016-01-01T00:00:00+09:00
                timestamp = parse_iso8601(match.replace('+', ':00+'))
        if not timestamp and video_detail.get('postedAt'):
            timestamp = parse_iso8601(
                video_detail['postedAt'].replace('/', '-'),
                delimiter=' ', timezone=datetime.timedelta(hours=9))

        view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
        if not view_count:
            match = self._html_search_regex(
                r'>Views: <strong[^>]*>([^<]+)</strong>',
                webpage, 'view count', default=None)
            if match:
                view_count = int_or_none(match.replace(',', ''))
        view_count = view_count or video_detail.get('viewCount')

        comment_count = (int_or_none(get_video_info('comment_num')) or
                         video_detail.get('commentCount') or
                         try_get(api_data, lambda x: x['thread']['commentCount']))
        if not comment_count:
            match = self._html_search_regex(
                r'>Comments: <strong[^>]*>([^<]+)</strong>',
                webpage, 'comment count', default=None)
            if match:
                comment_count = int_or_none(match.replace(',', ''))

        duration = (parse_duration(
            get_video_info('length') or
            self._html_search_meta(
                'video:duration', webpage, 'video duration', default=None)) or
            video_detail.get('length') or
            get_video_info('duration'))

        webpage_url = get_video_info('watch_url') or url

        # Note: cannot use api_data.get('owner', {}) because owner may be set to "null"
        # in the JSON, which will cause None to be returned instead of {}.
        owner = try_get(api_data, lambda x: x.get('owner'), dict) or {}
        uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
        uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
            'uploader': uploader,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'view_count': view_count,
            'comment_count': comment_count,
            'duration': duration,
            'webpage_url': webpage_url,
        }
2014-10-29 11:04:48 +01:00
class NiconicoPlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P<id>\d+)'

    _TEST = {
        'url': 'http://www.nicovideo.jp/mylist/27411728',
        'info_dict': {
            'id': '27411728',
            'title': 'AKB48のオールナイトニッポン',
        },
        'playlist_mincount': 225,
    }

    def _real_extract(self, url):
        # The mylist page embeds its items as a JSON array passed to
        # Mylist.preload(); extract that instead of scraping markup.
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)

        preload_json = self._search_regex(
            r'Mylist\.preload\(\d+, (\[.*\])\);', webpage, 'entries')

        entries = []
        for item in json.loads(preload_json):
            entries.append({
                '_type': 'url',
                'ie_key': NiconicoIE.ie_key(),
                'url': ('http://www.nicovideo.jp/watch/%s' %
                        item['item_data']['video_id']),
            })

        return {
            '_type': 'playlist',
            'id': list_id,
            'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'),
            'entries': entries,
        }
2018-09-08 22:46:40 +02:00
2018-09-09 00:00:05 +02:00
# USAGE: youtube-dl "nicosearch<NUMBER OF ENTRIES>:<SEARCH STRING>"
class NicovideoIE(SearchInfoExtractor):
    IE_DESC = 'Nico video search'
    _MAX_RESULTS = 100000
    _SEARCH_KEY = 'nicosearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = []
        curr_date = datetime.datetime.now().date()

        # The base search URL does not change between iterations, so build it
        # once outside the loop. (A stray debug print() of this URL was
        # removed: it polluted stdout, which youtube-dl reserves for
        # machine-readable output such as -j/-g.)
        search_url = "http://www.nicovideo.jp/search/%s?sort=f&order=d" % query

        while True:
            r = self._get_entries_for_date(search_url, query, curr_date)
            # did we gather more entries in the last few pages than were asked
            # for? If so, only add as many as are needed to reach the desired
            # number. (Python slicing clamps to the list length, so no min()
            # is required.)
            entries += r[:n - len(entries)]
            # for a given search, nicovideo will show a maximum of 50 pages.
            # My way around this is specifying a date for the search, down to
            # the date, which for the most part is a guarantee that the number
            # of pages in the search results will not exceed 50. For any given
            # search for a day, we extract everything available, and move on,
            # until finding as many entries as were requested.
            curr_date -= datetime.timedelta(days=1)
            if len(entries) >= n:
                break

        return {
            '_type': 'playlist',
            'id': query,
            'entries': entries,
        }

    def _get_entries_for_date(self, url, query, date, pageNumber=1):
        """Recursively collect the results of every page for a single day."""
        link = '%s&page=%s&start=%s&end=%s' % (url, pageNumber, date, date)
        results = self._download_webpage(
            link, query,
            note='Downloading results page %s for date %s' % (pageNumber, date))
        entries = []
        matches = re.findall(r'''<li.*(?!</li>) data-video-id=['|"](..[0-9]{1,8})''', results)
        for video_id in matches:
            entries.append(self.url_result(
                "http://www.nicovideo.jp/watch/" + str(video_id), 'Niconico'))
        # each page holds a maximum of 32 entries. If we've seen 32 entries on
        # the current page, it's possible there may be another, so we can
        # check. It's a little awkward, but it works.
        if len(matches) >= 32:
            entries += self._get_entries_for_date(url, query, date, pageNumber + 1)
        return entries