Commit 9a3b2e68c1 by zC5ZZFGOLreaxDGZhIijndYG4hZYcv3U, 2020-09-24 09:46:57 +02:00, committed by GitHub
2 changed files with 56 additions and 2 deletions

youtube_dl/extractor/extractors.py

@@ -728,7 +728,7 @@ from .nick import (
     NickNightIE,
     NickRuIE,
 )
-from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .niconico import NiconicoIE, NiconicoPlaylistIE, NicovideoIE
 from .ninecninemedia import NineCNineMediaIE
 from .ninegag import NineGagIE
 from .ninenow import NineNowIE

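The one-line change above is all the registration the new extractor needs: once NicovideoIE is importable from extractors.py, youtube-dl routes any "nicosearch...:" pseudo-URL to it through the pattern that SearchInfoExtractor derives from _SEARCH_KEY. A minimal sketch of that prefix parsing, assuming youtube-dl's standard SearchInfoExtractor semantics (parse_search_query is a hypothetical helper for illustration, not part of the codebase):

    # Sketch (not part of the commit): how SearchInfoExtractor turns the
    # registered _SEARCH_KEY into URL routing. The regex mirrors the pattern
    # generated by SearchInfoExtractor._make_valid_url().
    import re

    def parse_search_query(s, search_key='nicosearch', max_results=100000):
        m = re.match(r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % search_key, s)
        if m is None:
            return None  # not a nicosearch URL; some other extractor handles it
        prefix = m.group('prefix')
        # an empty prefix means one result; 'all' is capped at _MAX_RESULTS
        n = 1 if prefix == '' else max_results if prefix == 'all' else int(prefix)
        return min(n, max_results), m.group('query')

    print(parse_search_query('nicosearch5:example query'))    # (5, 'example query')
    print(parse_search_query('nicosearchall:example query'))  # (100000, 'example query')

The parsed count is what SearchInfoExtractor passes as n into the _get_n_results method implemented below.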
youtube_dl/extractor/niconico.py

@@ -3,8 +3,9 @@ from __future__ import unicode_literals
 import json
 import datetime
 import re
-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_urlparse,
@@ -468,3 +469,56 @@ class NiconicoPlaylistIE(InfoExtractor):
             'id': list_id,
             'entries': entries,
         }
# USAGE: youtube-dl "nicosearch<NUMBER OF ENTRIES>:<SEARCH STRING>"
class NicovideoIE(SearchInfoExtractor):
    IE_DESC = 'Nico video search'
    _MAX_RESULTS = 100000
    _SEARCH_KEY = 'nicosearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = []
        currDate = datetime.datetime.now().date()

        while True:
            search_url = 'http://www.nicovideo.jp/search/%s' % query
            r = self._get_entries_for_date(search_url, query, currDate)

            # Did we gather more entries in the last few pages than were
            # asked for? If so, only add as many as are needed to reach the
            # desired number.
            m = n - len(entries)
            entries += r[0:min(m, len(r))]

            # For a given search, nicovideo will show at most 50 pages of
            # results. The workaround is to narrow each search down to a
            # single day, which in practice keeps the result set under 50
            # pages. We extract everything available for that day, then step
            # back one day at a time until we have found as many entries as
            # were requested (or run out of history).
            currDate -= datetime.timedelta(days=1)

            if len(entries) >= n or currDate < datetime.date(2007, 1, 1):
                break

        return {
            '_type': 'playlist',
            'id': query,
            'entries': entries,
        }

    def _get_entries_for_date(self, url, query, date, pageNumber=1):
        # Results are accumulated across pages, so the list must be
        # initialised once, outside the loop.
        entries = []
        while True:
            link = '%s?page=%s&start=%s&end=%s' % (url, pageNumber, date, date)
            # A search page has no real video id, hence the placeholder.
            results = self._download_webpage(
                link, 'None', query={'Search_key': query},
                note='Extracting results from page %s for date %s' % (pageNumber, date))

            r = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.*?)(?=["\'])', results)
            for item in r:
                e = self.url_result('http://www.nicovideo.jp/watch/' + item, 'Niconico')
                entries.append(e)

            # Each page holds a maximum of 32 entries. If we have seen 32 on
            # the current page, there may be another page, so keep going;
            # otherwise we are done. It's a little awkward, but it works.
            if len(r) < 32:
                break
            pageNumber += 1

        return entries
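
Once merged, the extractor can be driven from the command line (as the USAGE comment shows) or from Python via the embedding API. A minimal sketch, assuming a youtube-dl build that contains this commit; the query string and options are illustrative:

    # Sketch (not part of the commit): exercising the new nicosearch key
    # through youtube-dl's embedding API.
    import youtube_dl

    with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
        # 'nicosearch5:' asks NicovideoIE for the first 5 results.
        result = ydl.extract_info('nicosearch5:example query',
                                  download=False, process=False)
        for entry in result['entries']:
            print(entry['url'])

With process=False the playlist comes back as unresolved url_result entries pointing at the watch pages; with the default process=True, youtube-dl would instead run NiconicoIE over each entry and download the videos.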