From a9dcb076cbc15cd4acace860465f457737a053a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Str=C3=B6m?= Date: Mon, 21 Jan 2019 16:30:59 +0100 Subject: [PATCH] [expressen] make extractor work with more URLs --- youtube_dl/extractor/expressen.py | 75 +++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py index dc8b855d2..1fc95d487 100644 --- a/youtube_dl/extractor/expressen.py +++ b/youtube_dl/extractor/expressen.py @@ -4,14 +4,17 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, +) from ..utils import ( determine_ext, int_or_none, unescapeHTML, unified_timestamp, + ExtractorError, ) - class ExpressenIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// @@ -44,6 +47,25 @@ class ExpressenIE(InfoExtractor): 'only_matching': True, }, { 'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + }, { + 'url': 'https://www.expressen.se/tv/livsstil/halsoliv/5-harliga-fakta-om-kaffe/', + 'md5': '4d4572e17d2bec5fa2c144bb63857934', + 'info_dict': { + 'id': '8790448', + 'ext': 'mp4', + 'title': '5 härliga fakta om kaffe', + 'description': 'md5:b7faa986d02765cdd7ccaa0db4673258', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 56, + 'timestamp': 1531986096, + 'upload_date': '20180719' + + } + }, { + 'url': 'https://www.expressen.se/videoplayer/embed/tv/livsstil/halsoliv/5-harliga-fakta-om-kaffe', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/tv/livsstil/halsoliv/har-gifter-de-sig-pa-10-000-meters-hojd/', 'only_matching': True, }] @@ -57,7 +79,10 @@ class ExpressenIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + original_url = url + embed_url = url.replace('expressen.se/tv/', 'expressen.se/videoplayer/embed/tv/') + + urls = (embed_url, original_url) def extract_data(name): return self._parse_json( @@ -66,20 +91,42 @@ class ExpressenIE(InfoExtractor): webpage, 'info', group='value'), display_id, transform_source=unescapeHTML) - info = extract_data('video-tracking-info') - video_id = info['videoId'] + for url in urls: + last = url == urls[len(urls) - 1] - data = extract_data('article-data') - stream = data['stream'] + try: + webpage = self._download_webpage(url, display_id) + + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if not last: + continue + raise ExtractorError('Video not found', expected=True) + + try: + info = extract_data('video-tracking-info') + video_id = info['videoId'] + + data = extract_data('article-data') + stream = data['stream'] + + if determine_ext(stream) == 'm3u8': + formats = self._extract_m3u8_formats( + stream, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{ + 'url': stream, + }] + + except (ExtractorError, KeyError) as e: + formats = None + + if formats: + break + elif last: + raise ExtractorError('Video not found', expected=True) - if determine_ext(stream) == 'm3u8': - formats = self._extract_m3u8_formats( - stream, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - else: - formats = [{ - 'url': stream, - }] self._sort_formats(formats) title = info.get('titleRaw') or data['title']