LinkedIn: Support download without login (#21860) and subtitles (#21879)

2025-02-18 18:17:55 +01:00 · 2019-12-24 12:34:43 +02:00 · 2019-12-24 12:34:43 +02:00 · 04a93ac37a
commit 04a93ac37a
parent 2dbc0967f2
2 changed files with 61 additions and 5 deletions
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@ -28,6 +28,7 @@ from youtube_dl.extractor import (
    RTVEALaCartaIE,
    FunnyOrDieIE,
    DemocracynowIE,
    LinkedInLearningIE,
 )
@ -219,6 +220,18 @@ class TestLyndaSubtitles(BaseTestSubtitles):
        self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7')
 class TestLinkedInSubtitles(BaseTestSubtitles):
    # Subtitle check against a LinkedIn Learning video page.
    url = 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true'
    IE = LinkedInLearningIE
    def test_allsubtitles(self):
        # Turn on subtitle writing for every available language.
        dl_params = self.DL.params
        dl_params['writesubtitles'] = True
        dl_params['allsubtitles'] = True
        found = self.getSubtitles()
        # Only an English track is expected, with a pinned content hash.
        expected_langs = set(['en'])
        self.assertEqual(set(found.keys()), expected_langs)
        english_hash = md5(found['en'])
        self.assertEqual(english_hash, 'b329730e94e7fbdbac0307b3cad1221a')
 class TestNPOSubtitles(BaseTestSubtitles):
    url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860'
    IE = NPOIE
--- a/youtube_dl/extractor/linkedin.py
+++ b/youtube_dl/extractor/linkedin.py
@ -8,6 +8,7 @@ from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    srt_subtitles_timecode,
    urlencode_postdata,
    urljoin,
 )
@ -31,10 +32,16 @@ class LinkedInLearningBaseIE(InfoExtractor):
            })
            sub = ' %dp' % resolution
        api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
        cookies = self._get_cookies(api_url)
        headers = {}
        if 'JSESSIONID' in cookies:
            headers['Csrf-Token'] = cookies['JSESSIONID'].value
        return self._download_json(
-            api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={
+            api_url, video_slug, 'Downloading%s JSON metadata' % sub,
-                'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
+            headers=headers,
-            }, query=query)['elements'][0]
+            query=query)['elements'][0]
    def _get_urn_id(self, video_data):
        urn = video_data.get('urn')
@ -47,12 +54,14 @@ class LinkedInLearningBaseIE(InfoExtractor):
        return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
    def _real_initialize(self):
        # We need the JSESSIONID from the login page, even if we're not logging in
        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')
        email, password = self._get_login_info()
        if email is None:
            return
        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')
        action_url = urljoin(self._LOGIN_URL, self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
            default='https://www.linkedin.com/uas/login-submit', group='url'))
@ -126,6 +135,8 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
        self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))
        subtitles = self.extract_subtitles(video_data)
        return {
            'id': self._get_video_id(video_data, course_slug, video_slug),
            'title': title,
@ -133,8 +144,40 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
            'thumbnail': video_data.get('defaultThumbnail'),
            'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
            'duration': int_or_none(video_data.get('durationInSeconds')),
            'subtitles': subtitles,
        }
    def _get_subtitles(self, video_data):
        transcript = video_data.get('transcript')
        if not transcript:
            return {}
        lines = transcript.get('lines')
        if not lines:
            return {}
        fixed_subs = self._fix_subtitles(lines)
        if fixed_subs:
            return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
        return {}
    def _fix_subtitles(self, lines):
        srt = ''
        seq_counter = 0
        for pos in range(0, len(lines) - 1):
            seq_current = lines[pos]
            seq_next = lines[pos + 1]
            appear_time = self._timecode(seq_current['transcriptStartAt'])
            disappear_time = self._timecode(seq_next['transcriptStartAt'])
            text = seq_current['caption'].strip()
            if text:
                seq_counter += 1
                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
        return srt
    def _timecode(self, ms):
        """Return ``srt_subtitles_timecode`` of ``ms`` scaled down by 1000."""
        seconds = ms / 1000.0
        return srt_subtitles_timecode(seconds)
 class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
    IE_NAME = 'linkedin:learning:course'