[ssa] Add extractor (Closes #5169)

2024-11-26 10:24:33 +01:00 · 2015-03-11 21:15:36 +06:00 · 2015-03-11 21:15:36 +06:00 · c792b5011f
commit c792b5011f
parent 32aaeca775
2 changed files with 59 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -460,6 +460,7 @@ from .sport5 import Sport5IE
 from .sportbox import SportBoxIE
 from .sportdeutschland import SportDeutschlandIE
 from .srmediathek import SRMediathekIE
 from .ssa import SSAIE
 from .stanfordoc import StanfordOpenClassroomIE
 from .steam import SteamIE
 from .streamcloud import StreamcloudIE
--- a/youtube_dl/extractor/ssa.py
+++ b/youtube_dl/extractor/ssa.py
@@ -0,0 +1,58 @@
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    unescapeHTML,
    parse_duration,
 )
 class SSAIE(InfoExtractor):
    """Extractor for film pages of the form ``ssa.nls.uk/film/<id>``.

    The page source is scraped for quoted ``'key', 'value'`` pairs
    (``streamer``, ``file``, ``image``) and for the metadata rendered as
    ``field_title`` / ``field_content`` span pairs.  The result describes an
    RTMP stream: ``url`` is the streamer address and ``play_path`` is the
    file name with its extension removed.
    """

    # The numeric path component is the video id.  Both http and https are
    # accepted; every URL matched by the former http-only pattern still matches.
    _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P<id>\d+)'
    _TEST = {
        'url': 'http://ssa.nls.uk/film/3561',
        'info_dict': {
            'id': '3561',
            'ext': 'flv',
            'title': 'SHETLAND WOOL',
            'description': 'md5:c5afca6871ad59b4271e7704fe50ab04',
            'duration': 900,
            # Raw string: '\.' is a regex escape, not a Python string escape.
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the film page at ``url``.

        The streamer, file and title lookups are mandatory; description,
        running time and thumbnail are looked up with ``fatal=False`` and are
        passed on as whatever the helper returns when nothing matches
        (presumably ``None`` -- confirm against ``_search_regex``).
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Whitespace between the comma and the opening quote is optional.
        # The previous pattern used ``\S*`` here, which cannot match a space
        # and therefore rejected ``'streamer', 'rtmp://...'``.
        streamer = self._search_regex(
            r"'streamer'\s*,\s*'(rtmp[^']+)'", webpage, 'streamer')

        file_name = self._search_regex(
            r"'file'\s*,\s*'([^']+)'", webpage, 'file')
        # Drop the extension for the play path; when the value has no dot,
        # rpartition() yields an empty head, so keep the value as it is.
        play_path = file_name.rpartition('.')[0] or file_name

        def search_field(field_name, fatal=False):
            """Return the text of the content span labelled ``field_name``."""
            # The field name doubles as the label used in diagnostics, so a
            # missing description is not reported as a missing title.
            return self._search_regex(
                r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name,
                webpage, field_name, fatal=fatal)

        # Titles may be wrapped in parentheses or square brackets on the
        # page; strip them from both ends.
        title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]')
        description = unescapeHTML(search_field('Description'))
        duration = parse_duration(search_field('Running time'))

        thumbnail = self._search_regex(
            r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': streamer,
            'play_path': play_path,
            'ext': 'flv',
            'title': title,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
        }