From 9ccab970342ee043b422f0a67532d4d47c15d932 Mon Sep 17 00:00:00 2001
From: john
Date: Sat, 8 Sep 2018 13:46:40 -0700
Subject: [PATCH] Added nicovideo search extractor

---
 README.md                          | 19 ++++++-----
 youtube_dl/extractor/extractors.py |  2 +-
 youtube_dl/extractor/niconico.py   | 51 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index dd068a462..cb69ef6f6 100644
--- a/README.md
+++ b/README.md
@@ -77,8 +77,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
                                      repairs broken URLs, but emits an error if
                                      this is not possible instead of searching.
     --ignore-config                  Do not read configuration files. When given
-                                     in the global configuration file
-                                     /etc/youtube-dl.conf: Do not read the user
+                                     in the global configuration file /etc
+                                     /youtube-dl.conf: Do not read the user
                                      configuration in ~/.config/youtube-
                                      dl/config (%APPDATA%/youtube-dl/config.txt
                                      on Windows)
@@ -108,8 +108,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
                                      proxy specified by --proxy (or none, if the
                                      option is not present) is used for the
                                      actual downloading.
-    --geo-bypass                     Bypass geographic restriction via faking
-                                     X-Forwarded-For HTTP header
+    --geo-bypass                     Bypass geographic restriction via faking X
+                                     -Forwarded-For HTTP header
     --no-geo-bypass                  Do not bypass geographic restriction via
                                      faking X-Forwarded-For HTTP header
     --geo-bypass-country CODE        Force bypass geographic restriction with
@@ -258,12 +258,11 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
     --cookies FILE                   File to read cookies from and dump cookie
                                      jar in
     --cache-dir DIR                  Location in the filesystem where youtube-dl
                                      can store some downloaded information
-                                     permanently. By default
-                                     $XDG_CACHE_HOME/youtube-dl or
-                                     ~/.cache/youtube-dl . At the moment, only
-                                     YouTube player files (for videos with
-                                     obfuscated signatures) are cached, but that
-                                     may change.
+                                     permanently. By default $XDG_CACHE_HOME
+                                     /youtube-dl or ~/.cache/youtube-dl . At the
+                                     moment, only YouTube player files (for
+                                     videos with obfuscated signatures) are
+                                     cached, but that may change.
     --no-cache-dir                   Disable filesystem caching
     --rm-cache-dir                   Delete all filesystem cache files

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 7dc569724..eb90049de 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -730,7 +730,7 @@ from .nick import (
     NickNightIE,
     NickRuIE,
 )
-from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .niconico import NiconicoIE, NiconicoPlaylistIE, NicovideoIE
 from .ninecninemedia import NineCNineMediaIE
 from .ninegag import NineGagIE
 from .ninenow import NineNowIE
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 76b412ff1..e4d986f73 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -3,8 +3,9 @@ from __future__ import unicode_literals

 import json
 import datetime
+import re

-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_urlparse,
@@ -468,3 +469,51 @@ class NiconicoPlaylistIE(InfoExtractor):
         'id': list_id,
         'entries': entries,
     }
+
+# USAGE: youtube-dl "nicosearch<number of results>:<search query>"
+class NicovideoIE(SearchInfoExtractor):
+    IE_DESC = 'Nico video search'
+    _MAX_RESULTS = 100000
+    _SEARCH_KEY = 'nicosearch'
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+        entries = []
+        currDate = datetime.datetime.now().date()
+
+        while True:
+            search_url = "http://www.nicovideo.jp/search/%s?sort=f&order=d" % query
+            r = self._get_entries_for_date(search_url, query, currDate)
+
+            # if this day yielded more entries than were asked for, add only as many as are needed to reach n
+            m = n - len(entries)
+            entries += r[:m]
+
+            # for a given search, nicovideo will show a maximum of 50 pages of results. The workaround is
+            # to restrict each search to a single day, which in practice keeps the results within 50 pages.
+            # Each day is extracted completely before stepping back one day, until n entries are gathered.
+            currDate -= datetime.timedelta(days=1)
+            if len(entries) >= n:
+                break
+
+        return {
+            '_type': 'playlist',
+            'id': query,
+            'entries': entries
+        }
+
+    def _get_entries_for_date(self, url, query, date, pageNumber=1):
+        link = url + "&page=" + str(pageNumber) + "&start=" + str(date) + "&end=" + str(date)
+        results = self._download_webpage(link, query, note='Downloading results page %s for date %s' % (pageNumber, date))
+        entries = []
+        r = re.findall(r'data-video-id=["\']?((?:[a-z]{2})?\d+)', results)
+
+        for item in r:
+            e = self.url_result("http://www.nicovideo.jp/watch/" + str(item), 'Niconico')
+            entries.append(e)
+
+        # each results page holds a maximum of 32 entries; a full page means there
+        # may be another page after it, so recurse into the next page
+        if len(r) >= 32:
+            entries += self._get_entries_for_date(url, query, date, pageNumber + 1)
+        return entries
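
For reference, and not as part of the patch: the new key follows SearchInfoExtractor's
<key><count>:<query> convention, so "youtube-dl 'nicosearch5:vocaloid'" fetches the first
five matches and "nicosearchall:" fetches up to _MAX_RESULTS. The sketch below drives the
same path through the Python API; it assumes the patch is applied to a youtube-dl checkout,
and the query string and option dict are arbitrary examples.

    # minimal sketch: exercise the nicosearch key via the Python API
    import youtube_dl

    with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
        # extract_info on a search query returns a playlist dict whose
        # 'entries' are the matched videos, each resolved through NiconicoIE
        result = ydl.extract_info('nicosearch5:vocaloid', download=False)
        for entry in result['entries']:
            print('%s: %s' % (entry['id'], entry.get('title')))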