Style changes and more fallbacks

Following the style guide a bit better
2024-11-22 16:44:32 +01:00 · 2019-03-08 17:12:11 +02:00 · 2019-03-08 17:12:11 +02:00 · 49ca4a7875
commit 49ca4a7875
parent a1eb5017b0
1 changed file with 41 additions and 18 deletions
--- a/youtube_dl/extractor/yleareena.py
+++ b/youtube_dl/extractor/yleareena.py
@ -19,7 +19,10 @@ class YleAreenaIE(InfoExtractor):
            'id': '1_iq074q8b',
            'ext': 'mxf',
            'title': 'Luottomies | Luottomies jouluspeciaali',
-            'description': u'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan mukavan perhejoulun. Muuttuuko mieli isosta yllätyksestä? Joulun erikoisjakson on ohjannut Jalmari Helander.',
+            'description':
                u'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan '
                'mukavan perhejoulun. Muuttuuko mieli isosta yllätyksestä? '
                'Joulun erikoisjakson on ohjannut Jalmari Helander.',
            'upload_date': '20171207',
            'height': 1080,
            'width': 1920,
@ -34,17 +37,44 @@ class YleAreenaIE(InfoExtractor):
    }
    def _real_extract(self, url):
        # This extractor will fetch some basic info and then lead to Kaltura
        # extractor.
        props = {
            '_type': 'url_transparent',
            'ie_key': 'Kaltura'
        }
        # Get essential data
-        video_id = self._match_id(url)
+        props['id'] = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, props['id'])
-        # Extract essential metadata from Areena webpage
+        # Try to extract title from OpenGraph metadata
-        title = self._og_search_title(webpage)
+        _title = self._og_search_title(webpage, fatal=False)
        description = self._og_search_description(webpage)
-        # player_url is not used for the actual extraction,
+        # Fallback #1: try to extract title from page body
-        # just for getting partner_id and entry_id for Kaltura extractor
+        if _title is None:
-        # (though it is still required or else the extraction will fail)
+            _title = self._html_search_regex(
                r'<h1>([^<]+)',
                webpage,
                'title',
                fatal=False
            )
        # Fallback #2: let Kaltura extractor give the title (it should have it)
        # If title is found from Areena page, use it
        if _title is not None:
            props['title'] = _title
        # Same thing for description
        _description = self._og_search_description(webpage)
        # No Areena fallback here, the page layout is so ambiguous we cannot
        # guarantee that the right description would match in series pages
        if _description is not None:
            props['description'] = _description
        # player_url is used for getting partner_id and entry_id for Kaltura
        # extractor
        try:
            player_url = url_or_none(
                self._og_search_property('video:secure_url', webpage)
@ -73,13 +103,6 @@ class YleAreenaIE(InfoExtractor):
            'Kaltura entry id'
        )
-        kaltura_url = 'kaltura:%s:%s' % (partner_id, entry_id)
+        props['url'] = 'kaltura:%s:%s' % (partner_id, entry_id)
-        return {
+        return props
            '_type': 'url_transparent',
            'id': video_id,
            'url': kaltura_url,
            'ie_key': 'Kaltura',
            'title': title,
            'description': description
        }