
Added nicovideo search extractor

john 2018-09-08 13:46:40 -07:00
parent 2e4350eec6
commit 9ccab97034
3 changed files with 61 additions and 12 deletions
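Once the patch is applied, the new extractor is invoked through its search key, per the USAGE comment in niconico.py:

    youtube-dl "nicosearch<NUMBER OF ENTRIES>:<SEARCH STRING>"

For example, "nicosearch10:example query" (an illustrative query, not taken from the commit) downloads the first 10 matching videos as a playlist, and the "nicosearchall:" prefix requests up to _MAX_RESULTS entries.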

README.md

@@ -77,8 +77,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
                                      repairs broken URLs, but emits an error if
                                      this is not possible instead of searching.
     --ignore-config                  Do not read configuration files. When given
-                                     in the global configuration file
-                                     /etc/youtube-dl.conf: Do not read the user
+                                     in the global configuration file /etc
+                                     /youtube-dl.conf: Do not read the user
                                      configuration in ~/.config/youtube-
                                      dl/config (%APPDATA%/youtube-dl/config.txt
                                      on Windows)
@@ -108,8 +108,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
                                      proxy specified by --proxy (or none, if the
                                      option is not present) is used for the
                                      actual downloading.
-    --geo-bypass                     Bypass geographic restriction via faking
-                                     X-Forwarded-For HTTP header
+    --geo-bypass                     Bypass geographic restriction via faking X
+                                     -Forwarded-For HTTP header
     --no-geo-bypass                  Do not bypass geographic restriction via
                                      faking X-Forwarded-For HTTP header
     --geo-bypass-country CODE        Force bypass geographic restriction with
@@ -258,12 +258,11 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
                                      jar in
     --cache-dir DIR                  Location in the filesystem where youtube-dl
                                      can store some downloaded information
-                                     permanently. By default
-                                     $XDG_CACHE_HOME/youtube-dl or
-                                     ~/.cache/youtube-dl . At the moment, only
-                                     YouTube player files (for videos with
-                                     obfuscated signatures) are cached, but that
-                                     may change.
+                                     permanently. By default $XDG_CACHE_HOME
+                                     /youtube-dl or ~/.cache/youtube-dl . At the
+                                     moment, only YouTube player files (for
+                                     videos with obfuscated signatures) are
+                                     cached, but that may change.
     --no-cache-dir                   Disable filesystem caching
     --rm-cache-dir                   Delete all filesystem cache files
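
All three README hunks above are line-wrapping churn in the auto-generated option list; the wording of each description is unchanged. For illustration, the options touched combine as follows (hypothetical invocation, the URL is a placeholder):

    youtube-dl --ignore-config --geo-bypass --cache-dir ~/.cache/youtube-dl "http://www.nicovideo.jp/watch/<VIDEO ID>"

--ignore-config skips all configuration files, --geo-bypass fakes an X-Forwarded-For header, and --cache-dir sets where player files for videos with obfuscated signatures are stored.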

youtube_dl/extractor/extractors.py

@@ -730,7 +730,7 @@ from .nick import (
     NickNightIE,
     NickRuIE,
 )
-from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .niconico import NiconicoIE, NiconicoPlaylistIE, NicovideoIE
 from .ninecninemedia import NineCNineMediaIE
 from .ninegag import NineGagIE
 from .ninenow import NineNowIE
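
Adding NicovideoIE to this import is what registers the extractor: extractors.py is the module youtube-dl scans to build its extractor list, so any IE class imported here becomes visible to the URL dispatcher. A quick check from a Python shell, assuming a checkout with this patch applied (a minimal sketch, not part of the commit):

    from youtube_dl.extractor import gen_extractors

    # SearchInfoExtractor derives its _VALID_URL from _SEARCH_KEY, so a
    # "nicosearch<N>:<query>" pseudo-URL should now match NicovideoIE.
    print([ie.IE_NAME for ie in gen_extractors()
           if ie.suitable('nicosearch10:example query')])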

youtube_dl/extractor/niconico.py

@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 
 import json
 import datetime
+import re
+import datetime
 
-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_urlparse,
@@ -468,3 +470,51 @@ class NiconicoPlaylistIE(InfoExtractor):
             'id': list_id,
             'entries': entries,
         }
+
+
+# USAGE: youtube-dl "nicosearch<NUMBER OF ENTRIES>:<SEARCH STRING>"
+class NicovideoIE(SearchInfoExtractor):
+    IE_DESC = 'Nico video search'
+    _MAX_RESULTS = 100000
+    _SEARCH_KEY = 'nicosearch'
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+        entries = []
+        curr_date = datetime.datetime.now().date()
+        while True:
+            search_url = 'http://www.nicovideo.jp/search/%s?sort=f&order=d' % query
+            r = self._get_entries_for_date(search_url, query, curr_date)
+            # Did the last few pages yield more entries than were asked for?
+            # If so, add only as many as are needed to reach the total.
+            m = n - len(entries)
+            entries += r[0:min(m, len(r))]
+            # nicovideo shows at most 50 pages for any one search. Narrowing
+            # each search to a single day (almost always) keeps the results
+            # under that limit; everything available for the day is taken,
+            # then the date steps back one day at a time until enough
+            # entries have been collected.
+            curr_date -= datetime.timedelta(days=1)
+            if len(entries) >= n:
+                break
+        return {
+            '_type': 'playlist',
+            'id': query,
+            'entries': entries,
+        }
+
+    def _get_entries_for_date(self, url, query, date, page_number=1):
+        link = '%s&page=%d&start=%s&end=%s' % (url, page_number, date, date)
+        results = self._download_webpage(
+            link, query,
+            note='Downloading results page %s for date %s' % (page_number, date))
+        entries = []
+        r = re.findall(r'<a href="/watch/(..[0-9]{1,8})\?', results)
+        for item in r:
+            e = self.url_result('http://www.nicovideo.jp/watch/' + item, 'Niconico')
+            entries.append(e)
+        # Each results page holds at most 32 entries; if this page is full,
+        # there may be another, so recurse into the next page.
+        if len(r) >= 32:
+            entries += self._get_entries_for_date(url, query, date, page_number + 1)
+        return entries
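
For context, _get_n_results is the only hook a SearchInfoExtractor subclass has to supply: the base class matches the "<key><N>:<query>" pseudo-URL, resolves N (a bare key means one result, 'all' means _MAX_RESULTS), and delegates. A simplified paraphrase of that dispatch, not the library's exact code:

    import re

    def dispatch(ie, url):
        # e.g. url = 'nicosearch10:example query'
        m = re.match(r'nicosearch(\d+|all|):(.+)', url)
        prefix, query = m.group(1), m.group(2)
        if prefix == '':
            n = 1                # bare 'nicosearch:' yields a single result
        elif prefix == 'all':
            n = ie._MAX_RESULTS  # 100000 for NicovideoIE
        else:
            n = int(prefix)
        return ie._get_n_results(query, n)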