mastodon extractor, closes #18226 #25886

2024-11-26 02:14:32 +01:00 · 2020-07-23 10:34:09 +02:00 · 2020-07-23 10:34:09 +02:00 · 0b3b53128a
commit 0b3b53128a
parent a115e07594
2 changed files with 112 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -601,6 +601,7 @@ from .markiza import (
    MarkizaPageIE,
 )
 from .massengeschmacktv import MassengeschmackTVIE
 from .mastodon import MastodonIE
 from .matchtv import MatchTVIE
 from .mdr import MDRIE
 from .mediaset import MediasetIE
--- a/youtube_dl/extractor/mastodon.py
+++ b/youtube_dl/extractor/mastodon.py
@ -0,0 +1,111 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import (
    InfoExtractor,
    ExtractorError,
 )
 from ..utils import (
    clean_html,
    str_or_none,
 )
 import re
 # This InfoExtractor handles any service that implements the Mastodon API, not just Mastodon itself.
 # Services known to work (others may already work, or could be made to):
 # - Mastodon - https://github.com/tootsuite/mastodon
 # - Glitch (a fork of Mastodon) - https://github.com/glitch-soc/mastodon
 # - Pleroma - https://git.pleroma.social/pleroma/pleroma
 # - Gab Social (a fork of Mastodon) - https://code.gab.com/gab/social/gab-social/
 class MastodonIE(InfoExtractor):
    """Extract audio/video attachments from a status on a Mastodon-API server.

    The status is fetched through ``/api/v1/statuses/<id>`` on the host taken
    from the URL.  A status with exactly one playable attachment yields a
    single video result; a status with several yields a ``multi_video``
    result with one entry per attachment.
    """
    IE_NAME = 'mastodon'
    _VALID_URL = r'https?://(?P<host>[^/\s]+)(?<!facebook\.com)/(?:(?:@[a-zA-Z0-9_]+|[a-zA-Z0-9_]+/posts|users/[a-zA-Z0-9_]+/statuses)|notice|objects)/(?P<id>[0-9a-zA-Z-]+)'
    # Attachment ``type`` values that are treated as downloadable media.
    # 'audio' is accepted alongside 'video' so that the behaviour matches the
    # "No audio/video attachments" error raised below.
    _MEDIA_TYPES = ('video', 'audio')
    _TESTS = [{
        # mastodon, video description
        "url": "https://mastodon.technology/@BadAtNames/104254332187004304",
        "info_dict": {
            "id": "104254332187004304",
            "title": "BadAtNames - Mfw trump supporters complain about twitter",
            "ext": "mp4",
            "description": "md5:53f4428d4dc7e25a8255cf2a08488f2e",
        },
    }, {
        # pleroma, /objects/ redirect, empty content
        "url": "https://fedi.valkyrie.world/objects/386d2d68-090f-492e-81bd-8d32a3a65627",
        "info_dict": {
            "id": "9xLMO1BcEEbaM54LBI",
            "title": "VD-15 - ",
            "ext": "mp4",
            "description": "video0_4_1.mp4",
        },
    }, {
        # pleroma, multiple videos in single post (can't define tests for _type multi_video)
        "url": "https://donotsta.re/notice/9xN1v6yM7WhzE7aIIC",
        "only_matching": True,
    }, {
        # gab social
        "url": "https://gab.com/ACT1TV/posts/104450493441154721",
        "info_dict": {
            "id": "104450493441154721",
            "title": "Bill Blaze - He shoots, he scores and the crowd went wild.... #Animal #Sports",
            "ext": "mp4",
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the status at ``url``.

        Raises ExtractorError (flagged as expected, i.e. not an extractor
        bug) when the status carries no attachments at all or none of a
        supported media type.
        """
        mobj = re.match(self._VALID_URL, url)
        host, status_id = mobj.group('host', 'id')
        display_id = '%s@%s' % (status_id, host)

        if '/objects/' in url:
            # /objects/<uuid> pages do not expose the status id in the URL;
            # the canonical status URL is read from the page's og:url tag and
            # extraction restarts from there.
            page = self._download_webpage(url, display_id, expected_status=302)
            real_url = self._og_search_property('url', page, default=None)
            # Guard against a page that names itself as canonical URL, which
            # would otherwise redirect to this same branch forever.
            if real_url and real_url != url:
                return {
                    "_type": "url",
                    "ie_key": "Mastodon",
                    "url": real_url,
                }

        metadata = self._download_json(
            'https://%s/api/v1/statuses/%s' % (host, status_id), display_id)

        attachments = metadata.get('media_attachments')
        if not attachments:
            raise ExtractorError('No attached medias', expected=True)

        # Keep only playable attachments that actually carry a media URL.
        medias = [
            media for media in attachments
            if media.get('type') in self._MEDIA_TYPES and media.get('url')]
        if not medias:
            raise ExtractorError('No audio/video attachments', expected=True)

        # Title is "<author> - <status text>"; missing parts become empty
        # strings instead of the literal text "None".
        account = metadata.get('account') or {}
        author = str_or_none(account.get('display_name') or account.get('acct')) or ''
        content = clean_html(str_or_none(metadata.get('content'))) or ''
        title = '%s - %s' % (author, content)

        if len(medias) == 1:
            media = medias[0]
            return {
                "id": status_id,
                "title": title,
                "description": str_or_none(media.get('description')),
                "url": str_or_none(media['url']),
                "thumbnail": str_or_none(media.get('preview_url')),
            }

        entries = []
        for index, media in enumerate(medias, 1):
            entries.append({
                # Each entry needs its own id; reusing the status id for all
                # of them makes per-entry output names and archive ids clash.
                "id": '%s-%d' % (status_id, index),
                "title": str_or_none(media.get('description')) or title,
                "url": str_or_none(media['url']),
                "thumbnail": str_or_none(media.get('preview_url')),
            })
        return {
            "_type": "multi_video",
            "id": status_id,
            "title": title,
            "entries": entries,
        }