[bannedvideo] Add new extractor

2024-12-24 15:27:55 +01:00 · 2020-04-02 03:09:05 +01:00 · 2020-04-02 03:09:05 +01:00 · ca8d548ee3
commit ca8d548ee3
parent 049c0486bb
1 changed files with 220 additions and 0 deletions
--- a/youtube_dl/extractor/bannedvideo.py
+++ b/youtube_dl/extractor/bannedvideo.py
@@ -0,0 +1,220 @@
+from __future__ import unicode_literals
+
+import json
+from datetime import datetime
+
+from .common import InfoExtractor
+
+
+class BannedVideoIE(InfoExtractor):
+    _GRAPHQL_API = 'https://api.infowarsmedia.com/graphql'
+    _GRAPHQL_HEADERS = {
+        'Content-Type': 'application/json; charset=utf-8'
+    }
+    _GRAPHQL_GETVIDEO_QUERY = '''
+query GetVideo($id: String!) {
+    getVideo(id: $id) {
+        ...DisplayVideoFields
+        streamUrl
+        directUrl
+        unlisted
+        live
+        tags {
+            _id
+            name
+            __typename
+        }
+        sale {
+            _id
+            text
+            __typename
+        }
+        __typename
+    }
+}
+fragment DisplayVideoFields on Video {
+    _id
+    title
+    summary
+    playCount
+    largeImage
+    embedUrl
+    published
+    videoDuration
+    channel {
+        _id
+        title
+        avatar
+        __typename
+    }
+    createdAt
+    __typename
+}'''
+    _GRAPHQL_GETCOMMENTS_QUERY = '''
+query GetVideoComments($id: String!, $limit: Float, $offset: Float) {
+    getVideoComments(id: $id, limit: $limit, offset: $offset) {
+        ...VideoComment
+        replyCount
+        __typename
+    }
+}
+fragment VideoComment on Comment {
+    _id
+    content
+    liked
+    user {
+        _id
+        username
+        __typename
+    }
+    voteCount {
+        positive
+        __typename
+    }
+    linkedUser {
+        _id
+        username
+        __typename
+    }
+    createdAt
+    __typename
+}'''
+    _GRAPHQL_GETCOMMENTSREPLIES_QUERY = '''
+query GetCommentReplies($id: String!, $limit: Float, $offset: Float) {
+    getCommentReplies(id: $id, limit: $limit, offset: $offset) {
+        ...VideoComment
+        replyTo {
+            _id
+            __typename
+        }
+        __typename
+    }
+}
+fragment VideoComment on Comment {
+    _id
+    content
+    liked
+    user {
+        _id
+        username
+        __typename
+    }
+    voteCount {
+        positive
+        __typename
+    }
+    linkedUser {
+        _id
+        username
+        __typename
+    }
+    createdAt
+    __typename
+}'''
+    _VALID_URL = r'https?://(?:www\.)?banned\.video/watch\?id=(?P<id>[0-f]{24})'
+    _TEST = {
+        'url': 'https://banned.video/watch?id=5e7a859644e02200c6ef5f11',
+        'md5': '14b6e81d41beaaee2215cd75c6ed56e4',
+        'info_dict': {
+            'id': '5e7a859644e02200c6ef5f11',
+            'ext': 'mp4',
+            'title': 'China Discovers Origin of Corona Virus: Issues Emergency Statement',
+            'thumbnail': r're:^https?://(?:www\.)?assets\.infowarsmedia.com/images/',
+            'description': 'The Chinese Communist Party Official Spokesperson At the Ministry of Truth Released Their Statement Exclusively To Alex Jones and Infowars.com',
+            'upload_date': '20200324',
+            'timestamp': 1585084295.064,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_info = self._download_json(
+            self._GRAPHQL_API,
+            video_id,
+            headers=self._GRAPHQL_HEADERS,
+            data=json.dumps({
+                'variables': {
+                    'id': video_id
+                },
+                'operationName': 'GetVideo',
+                'query': self._GRAPHQL_GETVIDEO_QUERY
+            }).encode('utf8'),
+        ).get('data').get('getVideo')
+        video_comments = self._download_json(
+            self._GRAPHQL_API,
+            video_id,
+            headers=self._GRAPHQL_HEADERS,
+            data=json.dumps({
+                'variables': {
+                    'id': video_id
+                },
+                'operationName': 'GetVideoComments',
+                'query': self._GRAPHQL_GETCOMMENTS_QUERY
+            }).encode('utf8'),
+        ).get('data').get('getVideoComments')
+        upload_date = datetime.fromisoformat(video_info.get('createdAt')[:-1])
+        metadata = {}
+        metadata['id'] = video_id
+        metadata['title'] = video_info.get('title')[:-1]
+        metadata['description'] = video_info.get('summary')
+        metadata['channel'] = video_info.get('channel').get('title')
+        metadata['channel_id'] = video_info.get('channel').get('_id')
+        metadata['view_count'] = video_info.get('playCount')
+        metadata['thumbnail'] = video_info.get('largeImage')
+        metadata['duration'] = video_info.get('videoDuration')
+        metadata['upload_date'] = upload_date.strftime('%Y%m%d')
+        metadata['timestamp'] = upload_date.timestamp()
+        tags = []
+
+        for tag in video_info.get('tags'):
+            tags.append(tag.get('name'))
+
+        metadata['tags'] = tags
+
+        is_live = video_info.get('live')
+
+        if is_live:
+            formats = []
+            formats.append(self._extract_m3u8_formats(video_info.get('streamUrl'), video_id, entry_protocol='hls', live=True))
+            metadata['formats'] = formats
+        else:
+            metadata['url'] = video_info.get('directUrl')
+
+        metadata['is_live'] = is_live
+        comments = []
+
+        for comment in video_comments:
+            comment_date = datetime.fromisoformat(comment.get('createdAt')[:-1])
+            comments.append({
+                'id': comment.get('_id'),
+                'text': comment.get('content'),
+                'author': comment.get('user').get('username'),
+                'author_id': comment.get('user').get('_id'),
+                'timestamp': comment_date.timestamp(),
+                'parent': 'root'
+            })
+            if comment.get('replyCount') > 0:
+                replies = self._download_json(
+                    self._GRAPHQL_API,
+                    video_id,
+                    headers=self._GRAPHQL_HEADERS,
+                    data=json.dumps({
+                        'variables': {
+                            'id': comment.get('_id')
+                        },
+                        'operationName': 'GetCommentReplies',
+                        'query': self._GRAPHQL_GETCOMMENTSREPLIES_QUERY
+                    }).encode('utf8'),
+                ).get('data').get('getCommentReplies')
+                for reply in replies:
+                    reply_date = datetime.fromisoformat(reply.get('createdAt')[:-1])
+                    comments.append({
+                        'id': reply.get('_id'),
+                        'text': reply.get('content'),
+                        'author': reply.get('user').get('username'),
+                        'author_id': reply.get('user').get('_id'),
+                        'timestamp': reply_date.timestamp(),
+                        'parent': comment.get('_id')
+                    })
+        metadata["comments"] = comments
+        return metadata