1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-26 02:14:32 +01:00

[CRTVG] Add new extractor

This commit is contained in:
m8factorial 2020-04-26 19:07:01 +02:00
parent 00eb865b3c
commit 8efa309b20
2 changed files with 156 additions and 0 deletions

View File

@@ -0,0 +1,155 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
unescapeHTML,
unified_strdate,
)
class CRTVGIE(InfoExtractor):
    """Extractor for media pages on crtvg.gal / crtvg.es.

    The URL category (``rg/...``, ``tvg/a-carta``, ``informativos``,
    ``en-serie``) selects both the media type (``rg`` pages are served as
    mp3, everything else as mp4 via HLS) and the set of HTML patterns used
    to scrape title, description, series and release date.
    """

    _VALID_URL = r'https?://(?:www\.)?crtvg\.(?:gal|es)/(?P<category>rg/a-carta|rg/podcast|rg/destacados|tvg/a-carta|informativos|en-serie)/(?:[^/]+/)*(?P<slug>[A-Za-z0-9-]*)-(?P<id>[0-9]{4,})?'
    _TESTS = [{
        'url': 'http://www.crtvg.es/tvg/a-carta/pepe-o-ingles-28-best-of',
        'info_dict': {
            'id': '594037',
            'ext': 'mp4',
            'title': 'The Best of... (O mellor de Pepe o Inglés)',
            'description': 'Lola conta como andan as cousas en Cruceiro un ano despois, con Pepe e Pilar por Londres e Filipa e o alcalde vivindo en Marbella e presenta o último episodio, un resume co mellor da serie, unha especie de propina como dia ela.',
            'series': 'Pepe o inglés',
            'release_date': '20130527',
        },
    }, {
        'url': 'http://www.crtvg.es/rg/podcast/a-bola-extra-a-bola-extra-do-dia-21-04-2020-4383702',
        'info_dict': {
            'id': '4383702',
            'ext': 'mp3',
            'title': 'A bóla extra do día 21/04/2020',
            'description': None,
            'series': 'A bóla extra',
            'release_date': '20200421',
        },
    }]

    def _search_text(self, pattern, webpage, name, **kwargs):
        """Non-fatal regex search on the page; the result is HTML-unescaped.

        Returns None when the pattern does not match. Extra keyword
        arguments (e.g. ``group``, ``flags``) are forwarded to
        ``_html_search_regex``.
        """
        return unescapeHTML(self._html_search_regex(
            pattern, webpage, name, fatal=False, **kwargs))

    def _search_date(self, pattern, webpage):
        """Non-fatal search for a date string, normalised with unified_strdate."""
        return unified_strdate(self._search_text(
            pattern, webpage, 'release_date', group=1))

    def _real_extract(self, url):
        # Match the URL once and read the groups directly; the numeric id is
        # optional in _VALID_URL, so it may legitimately be None here.
        mobj = re.match(self._VALID_URL, url)
        category = mobj.group('category')
        media_id = mobj.group('id')

        # Use the slug for progress messages when the URL carries no id.
        webpage = self._download_webpage(
            url, media_id or mobj.group('slug'))

        if media_id is None:
            # Fall back to the id embedded in the page's metadata script.
            media_id = self._html_search_regex(
                r'metadata\[\'ns_st_ci\'\][ ]*=[ ]*(\d{4,})',
                webpage, 'media_id', group=1, fatal=False)

        # Radio Galega in mp3, otherwise mp4
        is_radio = 'rg' in category
        ext = 'mp3' if is_radio else 'mp4'

        # NOTE(review): the '.*' and the unescaped dot in 'flumotion.com' are
        # kept as in the original pattern; tightening them needs checking
        # against live pages.
        media_url = self._html_search_regex(
            r'https?://(?:www\.)?.*flumotion.com/videos/(?:[^/]+/)*?([A-Za-z0-9\-_]*)\.' + ext,
            webpage, 'media_url', group=0)

        if 'a-carta' in category or 'destacados' in category:
            title = self._search_text(
                r'<h2 class="destacado-info-titulo-programa">(?:\s*)?([^\n\r]*)(?:\s*)?</h2>',
                webpage, 'title', group=1)
            description = self._search_text(
                r'<div class="entrada-contenido">(?:\s*)?([^\n\r]*)(?:\s*)?',
                webpage, 'description')
            series = self._search_text(
                r'<h2 class="decorativo">(?:\s*)?<a href="(?:[^"]*)?"(?:\s*)?title="([^"]*)?">',
                webpage, 'series', group=1)
            release_date = self._search_date(
                r'<div class="entrada-blog-fecha">(?:\s*)?(?:\D*)?(\d{2}/\d{2}/\d{4})',
                webpage)
        elif category == 'en-serie':
            title = self._search_text(
                r'<div class="ficha">(?:\s*)?<h2>(.*)?</h2>',
                webpage, 'title', group=1)
            description = self._search_text(
                r'<div class="ficha">(?:\s*)?<h2>(?:.*)?</h2>(?:\s*)<p>(.*)?</p>',
                webpage, 'description')
            series = self._search_text(
                r'<div class="titulo-serie">(?:\s*)?<h1>(.*)?</h1>(?:\s*)',
                webpage, 'series', group=1, flags=re.S)
            release_date = self._search_date(
                r'metadata\[\'ns_st_ddt\'\][ ]*=[ ]*\'?(\d{4}\-\d{2}\-\d{2})',
                webpage)
        elif category == 'informativos':
            title = self._search_text(
                r'<h3 class="entrada-titulo">(.*)?</h3>',
                webpage, 'title', group=1)
            description = unescapeHTML(self._html_search_meta(
                'og:description', webpage, 'description', fatal=False))
            series = 'Noticias de Galicia'
            release_date = self._search_date(
                r'<div class="entrada-blog-fecha">(?:\s*)?(?:\D*)?(\d{2}/\d{2}/\d{4})',
                webpage)
        elif category == 'rg/podcast':
            # The page <title> doubles as the episode title and carries the
            # broadcast date.
            title = self._search_text(
                r'<title>(.*)?</title>',
                webpage, 'title', group=1)
            description = None
            series = self._search_text(
                r'<div class="titulo">(?:\s*)?<a href="(?:[^"]*)?"(?:\s*)?title="([^"]*)?">',
                webpage, 'series', group=1)
            release_date = self._search_date(
                r'<title>(?:\D*)?(\d{2}/\d{2}/\d{4})',
                webpage)
        else:
            # Generic fallback for any category without dedicated patterns.
            title = unescapeHTML(self._html_search_meta(
                ('title', 'og:title'),
                webpage, 'title', fatal=False))
            if is_radio:
                description = None
            else:
                description = unescapeHTML(self._html_search_meta(
                    'og:description', webpage, 'description', fatal=False))
            series = self._search_text(
                r'<title>(.*)?</title>',
                webpage, 'series', group=1)
            release_date = None

        # Keep only the part before the first '|' separator, if any.
        if title is not None and '|' in title:
            title = title.split('|')[0].rstrip()
        if series is not None and '|' in series:
            series = series.split('|')[0].rstrip()

        if is_radio:
            formats = [{
                'format_id': 'audio',
                'url': media_url,
                'ext': ext,
                # 'none' is the marker for "no video stream" (audio only).
                'vcodec': 'none',
                'acodec': ext,
            }]
        else:
            formats = self._extract_m3u8_formats(
                media_url + '/playlist.m3u8',
                media_id, ext='mp4', fatal=False)
        self._sort_formats(formats)

        return {
            'id': media_id,
            'title': title,
            'description': description,
            'series': series,
            'release_date': release_date,
            'formats': formats,
        }

View File

@@ -235,6 +235,7 @@ from .corus import CorusIE
from .cracked import CrackedIE
from .crackle import CrackleIE
from .crooksandliars import CrooksAndLiarsIE
from .crtvg import CRTVGIE
from .crunchyroll import (
    CrunchyrollIE,
    CrunchyrollShowPlaylistIE