2019-03-08 06:37:22 +01:00
|
|
|
# coding: utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
from .common import InfoExtractor
|
|
|
|
from ..utils import (
|
|
|
|
RegexNotFoundError,
|
|
|
|
url_or_none
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
class YleAreenaIE(InfoExtractor):
    """Extractor for Yle Areena / Arenan programme pages.

    The programme page only contributes OpenGraph metadata; the actual
    extraction is delegated to the Kaltura extractor through a
    ``url_transparent`` result.
    """

    # Dots are escaped so that only the literal yle.fi hosts match.
    _VALID_URL = r'https?://(?:areena|arenan)\.yle\.fi/(?P<id>[0-9]-[0-9]+)'
    _GEO_COUNTRIES = ['FI']

    _TEST = {
        'url': 'https://areena.yle.fi/1-4256816',
        'md5': 'b9658c5960a8c2ca4ba8f1b0ff079df2',
        'info_dict': {
            'id': '1_iq074q8b',
            'ext': 'mxf',
            'title': 'Luottomies | Luottomies jouluspeciaali',
            'description':
                'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan '
                'mukavan perhejoulun. Muuttuuko mieli isosta yllätyksestä? '
                'Joulun erikoisjakson on ohjannut Jalmari Helander.',
            'upload_date': '20171207',
            'height': 1080,
            'width': 1920,
            'fps': 25,
            'duration': 1302,
            'timestamp': 1512633989,
            'extractor': 'Kaltura',
            'uploader_id': 'ovp@yle.fi',
            'webpage_url_basename': '1-4256816',
            'webpage_url': 'https://areena.yle.fi/1-4256816'
        }
    }

    def _real_extract(self, url):
        """Collect page metadata and hand the media over to Kaltura.

        Returns a ``url_transparent`` result dict whose ``url`` is a
        ``kaltura:<partner_id>:<entry_id>`` reference. ``title`` and
        ``description`` are only set when they are found on the Areena
        page; otherwise they are left for the Kaltura extractor to supply.

        Raises RegexNotFoundError when neither OpenGraph video property
        yields a valid player URL. The Kaltura id lookups use the default
        (fatal) mode of ``_search_regex``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Basic result skeleton; everything else is resolved by Kaltura.
        info = {
            '_type': 'url_transparent',
            'ie_key': 'Kaltura',
            'id': video_id,
        }

        # Title: OpenGraph first, then the first <h1> of the page body.
        # If both fail, the Kaltura extractor is expected to provide it.
        title = self._og_search_title(webpage, fatal=False)
        if title is None:
            title = self._html_search_regex(
                r'<h1>([^<]+)', webpage, 'title', fatal=False)
        if title is not None:
            info['title'] = title

        # Description: OpenGraph only. There is no page-body fallback
        # because the layout is too ambiguous to guarantee that the right
        # description would match on series pages.
        description = self._og_search_description(webpage)
        if description is not None:
            info['description'] = description

        # The player URL carries the Kaltura partner id and entry id.
        # Prefer the secure variant and fall back to the plain one only
        # when the secure one is missing or is not a valid URL.
        player_url = None
        for og_property in ('video:secure_url', 'video:url'):
            player_url = url_or_none(self._og_search_property(
                og_property, webpage, default=None))
            if player_url is not None:
                break
        if player_url is None:
            raise RegexNotFoundError('Cannot find player url')

        # Get Kaltura identifiers from the player URL.
        partner_id = self._search_regex(
            r'/p/([0-9]+)', player_url, 'Kaltura partner id')
        entry_id = self._search_regex(
            r'/entry_id/([0-9]_[0-9a-z]+)', player_url, 'Kaltura entry id')

        info['url'] = 'kaltura:%s:%s' % (partner_id, entry_id)
        return info
|