[cctv] add support for cctv.com to download all videos of one channel

2024-12-02 13:27:56 +01:00 · 2018-10-14 16:56:50 +08:00 · 2018-10-14 16:56:50 +08:00 · b1e4e389f3
commit b1e4e389f3
parent 5d90a8a5f3
2 changed files with 55 additions and 1 deletions
--- a/youtube_dl/extractor/cctv.py
+++ b/youtube_dl/extractor/cctv.py
@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import json
 import re

 from .common import InfoExtractor
@ -189,3 +190,56 @@ class CCTVIE(InfoExtractor):
            'duration': duration,
            'formats': formats,
        }
+
+
+class CCTVChannelIE(InfoExtractor):
+    IE_DESC = '央视网 栏目'
+    _VALID_URL = r'http://tv.cctv.com/lm/(?P<id>[0-9A-Za-z-_]+)/?'
+    _TESTS = [{
+        'url': 'http://tv.cctv.com/lm/d10fys/',
+        'only_matching': True,
+    }]
+
+    def _entries(self, page, playlist_id):
+        re_req_item_id = re.compile(r'setItemByid[a-zA-Z0-9]+')
+        re_req_id_tmp = re.compile(r'videolistByColumnId\?id=[a-zA-Z0-9]+(?=&)')
+        re_req_id = re.compile(r'(?<=id=)[a-zA-Z0-9]+')
+        count_per_page = 100
+
+        req_item_id = re_req_item_id.findall(page)[0]
+        req_id = re_req_id.findall(re_req_id_tmp.findall(page)[0])[0]
+
+        page = 0
+        while True:
+            page += 1
+
+            url_template = "http://api.cntv.cn/lanmu/videolistByColumnId" + \
+                           "?id={}&serviceId=tvcctv&type=0&n={}&t=jsonp&cb={}&p=".format(
+                               req_id, count_per_page, req_item_id)
+            content = self._download_webpage(url_template + str(page), playlist_id)
+            if not content:
+                break
+
+            content = content.rstrip()
+            req_item_id = re_req_item_id.findall(content)[0]
+
+            video_list = json.loads(content[(len(req_item_id) + 1):-2])["response"]["docs"]
+
+            for content_dict in video_list:
+                video_id, video_title, video_url = \
+                    content_dict["videoId"], content_dict["videoTitle"], content_dict["videoUrl"]
+                yield self.url_result(video_url, ie="CCTV", video_id=video_id, video_title=video_title)
+
+            if len(video_list) < count_per_page:
+                break
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel_page = self._download_webpage(
+            url, channel_id,
+            'Downloading channel page', fatal=False)
+        if channel_page is False:
+            raise Exception('CCTV said: Cannot connect to {}'.format(url), expected=True)
+
+        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -183,7 +183,7 @@ from .cbsnews import (
 from .cbssports import CBSSportsIE
 from .ccc import CCCIE
 from .ccma import CCMAIE
-from .cctv import CCTVIE
+from .cctv import CCTVIE, CCTVChannelIE
 from .cda import CDAIE
 from .ceskatelevize import (
    CeskaTelevizeIE,