1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-30 04:08:01 +01:00

LinkedIn: Support download without login (#21860) and subtitles (#21879)

This commit is contained in:
Erez Volk 2019-12-24 12:34:43 +02:00
parent 2dbc0967f2
commit 04a93ac37a
2 changed files with 61 additions and 5 deletions

View File

@ -28,6 +28,7 @@ from youtube_dl.extractor import (
RTVEALaCartaIE, RTVEALaCartaIE,
FunnyOrDieIE, FunnyOrDieIE,
DemocracynowIE, DemocracynowIE,
LinkedInLearningIE,
) )
@ -219,6 +220,18 @@ class TestLyndaSubtitles(BaseTestSubtitles):
self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7')
class TestLinkedInSubtitles(BaseTestSubtitles):
    # Subtitle extraction check for a LinkedIn Learning video page.
    url = 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true'
    IE = LinkedInLearningIE

    def test_allsubtitles(self):
        # Ask the downloader to write subtitles and to fetch every track.
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        found = self.getSubtitles()
        # Exactly one track ('en') is expected; its content is pinned by checksum.
        self.assertEqual(set(found.keys()), set(['en']))
        self.assertEqual(md5(found['en']), 'b329730e94e7fbdbac0307b3cad1221a')
class TestNPOSubtitles(BaseTestSubtitles): class TestNPOSubtitles(BaseTestSubtitles):
url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860'
IE = NPOIE IE = NPOIE

View File

@ -8,6 +8,7 @@ from ..utils import (
ExtractorError, ExtractorError,
float_or_none, float_or_none,
int_or_none, int_or_none,
srt_subtitles_timecode,
urlencode_postdata, urlencode_postdata,
urljoin, urljoin,
) )
@ -31,10 +32,16 @@ class LinkedInLearningBaseIE(InfoExtractor):
}) })
sub = ' %dp' % resolution sub = ' %dp' % resolution
api_url = 'https://www.linkedin.com/learning-api/detailedCourses' api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
cookies = self._get_cookies(api_url)
headers = {}
if 'JSESSIONID' in cookies:
headers['Csrf-Token'] = cookies['JSESSIONID'].value
return self._download_json( return self._download_json(
api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ api_url, video_slug, 'Downloading%s JSON metadata' % sub,
'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, headers=headers,
}, query=query)['elements'][0] query=query)['elements'][0]
def _get_urn_id(self, video_data): def _get_urn_id(self, video_data):
urn = video_data.get('urn') urn = video_data.get('urn')
@ -47,12 +54,14 @@ class LinkedInLearningBaseIE(InfoExtractor):
return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
def _real_initialize(self): def _real_initialize(self):
# We need the JSESSIONID from the login page, even if we're not logging in
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
email, password = self._get_login_info() email, password = self._get_login_info()
if email is None: if email is None:
return return
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
action_url = urljoin(self._LOGIN_URL, self._search_regex( action_url = urljoin(self._LOGIN_URL, self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
default='https://www.linkedin.com/uas/login-submit', group='url')) default='https://www.linkedin.com/uas/login-submit', group='url'))
@ -126,6 +135,8 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr')) self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))
subtitles = self.extract_subtitles(video_data)
return { return {
'id': self._get_video_id(video_data, course_slug, video_slug), 'id': self._get_video_id(video_data, course_slug, video_slug),
'title': title, 'title': title,
@ -133,8 +144,40 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
'thumbnail': video_data.get('defaultThumbnail'), 'thumbnail': video_data.get('defaultThumbnail'),
'timestamp': float_or_none(video_data.get('publishedOn'), 1000), 'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
'duration': int_or_none(video_data.get('durationInSeconds')), 'duration': int_or_none(video_data.get('durationInSeconds')),
'subtitles': subtitles,
} }
def _get_subtitles(self, video_data):
transcript = video_data.get('transcript')
if not transcript:
return {}
lines = transcript.get('lines')
if not lines:
return {}
fixed_subs = self._fix_subtitles(lines)
if fixed_subs:
return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
return {}
def _fix_subtitles(self, lines):
srt = ''
seq_counter = 0
for pos in range(0, len(lines) - 1):
seq_current = lines[pos]
seq_next = lines[pos + 1]
appear_time = self._timecode(seq_current['transcriptStartAt'])
disappear_time = self._timecode(seq_next['transcriptStartAt'])
text = seq_current['caption'].strip()
if text:
seq_counter += 1
srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
return srt
def _timecode(self, ms):
    """Convert ``ms`` to seconds and format it with ``srt_subtitles_timecode``."""
    seconds = ms / 1000.0
    return srt_subtitles_timecode(seconds)
class LinkedInLearningCourseIE(LinkedInLearningBaseIE): class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
IE_NAME = 'linkedin:learning:course' IE_NAME = 'linkedin:learning:course'