[zdf/common] Use API in ZDF extractor.

This also adds several extra format fields (such as quality_name and filesize).
Fixes #1518
This commit is contained in:
Philipp Hagemeister 2013-11-25 03:12:26 +01:00
parent 1fb2bcbbf7
commit 02dbf93f0e
5 changed files with 112 additions and 70 deletions

View File

@ -1,4 +1,3 @@
import math
import os import os
import re import re
import subprocess import subprocess
@ -11,6 +10,7 @@ from .utils import (
ContentTooShortError, ContentTooShortError,
determine_ext, determine_ext,
encodeFilename, encodeFilename,
format_bytes,
sanitize_open, sanitize_open,
timeconvert, timeconvert,
) )
@ -53,20 +53,6 @@ class FileDownloader(object):
self._progress_hooks = [] self._progress_hooks = []
self.params = params self.params = params
@staticmethod
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be None, a number, or a numeric string. None yields 'N/A';
    any other value is scaled by powers of 1024 and rendered with two
    decimals followed by the unit, e.g. 1536 -> '1.50KiB'.

    Magnitudes below one byte stay in 'B', magnitudes beyond the largest
    known unit stay in 'YiB', and negative values keep their sign.
    """
    if bytes is None:
        return 'N/A'
    # Accept both text types so Python 2 unicode input is converted as well.
    if isinstance(bytes, (type(u''), type(''))):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    scaled = float(bytes)
    exponent = 0
    # Repeated division instead of int(math.log(...)): the logarithm is
    # negative for tiny fractions (which indexed the suffix list from the
    # end), undefined for negative input, and unbounded for huge values.
    while abs(scaled) >= 1024.0 and exponent < len(suffixes) - 1:
        scaled /= 1024.0
        exponent += 1
    return '%.2f%s' % (scaled, suffixes[exponent])
@staticmethod @staticmethod
def format_seconds(seconds): def format_seconds(seconds):
(mins, secs) = divmod(seconds, 60) (mins, secs) = divmod(seconds, 60)
@ -117,7 +103,7 @@ class FileDownloader(object):
def format_speed(speed): def format_speed(speed):
if speed is None: if speed is None:
return '%10s' % '---b/s' return '%10s' % '---b/s'
return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed)) return '%10s' % ('%s/s' % format_bytes(speed))
@staticmethod @staticmethod
def best_block_size(elapsed_time, bytes): def best_block_size(elapsed_time, bytes):
@ -525,7 +511,7 @@ class FileDownloader(object):
self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
return False return False
data_len_str = self.format_bytes(data_len) data_len_str = format_bytes(data_len)
byte_counter = 0 + resume_len byte_counter = 0 + resume_len
block_size = self.params.get('buffersize', 1024) block_size = self.params.get('buffersize', 1024)
start = time.time() start = time.time()

View File

@ -30,6 +30,7 @@ from .utils import (
DownloadError, DownloadError,
encodeFilename, encodeFilename,
ExtractorError, ExtractorError,
format_bytes,
locked_file, locked_file,
MaxDownloadsReached, MaxDownloadsReached,
PostProcessingError, PostProcessingError,
@ -867,9 +868,11 @@ class YoutubeDL(object):
def list_formats(self, info_dict): def list_formats(self, info_dict):
def format_note(fdict): def format_note(fdict):
if fdict.get('format_note') is not None:
return fdict['format_note']
res = u'' res = u''
if fdict.get('format_note') is not None:
res += fdict['format_note'] + u' '
if fdict.get('quality_name') is not None:
res += u'%s ' % fdict['quality_name']
if fdict.get('vcodec') is not None: if fdict.get('vcodec') is not None:
res += u'%-5s' % fdict['vcodec'] res += u'%-5s' % fdict['vcodec']
elif fdict.get('vbr') is not None: elif fdict.get('vbr') is not None:
@ -886,25 +889,30 @@ class YoutubeDL(object):
res += 'audio' res += 'audio'
if fdict.get('abr') is not None: if fdict.get('abr') is not None:
res += u'@%3dk' % fdict['abr'] res += u'@%3dk' % fdict['abr']
if fdict.get('filesize') is not None:
if res:
res += u', '
res += format_bytes(fdict['filesize'])
return res return res
def line(format): def line(format, idlen=20):
return (u'%-20s%-10s%-12s%s' % ( return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
format['format_id'], format['format_id'],
format['ext'], format['ext'],
self.format_resolution(format), self.format_resolution(format),
format_note(format), format_note(format),
) ))
)
formats = info_dict.get('formats', [info_dict]) formats = info_dict.get('formats', [info_dict])
formats_s = list(map(line, formats)) idlen = max(len(u'format code'),
max(len(f['format_id']) for f in formats))
formats_s = [line(f, idlen) for f in formats]
if len(formats) > 1: if len(formats) > 1:
formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
header_line = line({ header_line = line({
'format_id': u'format code', 'ext': u'extension', 'format_id': u'format code', 'ext': u'extension',
'_resolution': u'resolution', 'format_note': u'note'}) '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
(info_dict['id'], header_line, u"\n".join(formats_s))) (info_dict['id'], header_line, u"\n".join(formats_s)))

View File

@ -76,6 +76,8 @@ class InfoExtractor(object):
* acodec Name of the audio codec in use * acodec Name of the audio codec in use
* vbr Average video bitrate in KBit/s * vbr Average video bitrate in KBit/s
* vcodec Name of the video codec in use * vcodec Name of the video codec in use
* quality_name Human-readable name of the video quality.
* filesize The number of bytes, if known in advance
webpage_url: The url to the video webpage, if given to youtube-dl it webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set should allow to get the same result again. (It will be set
by YoutubeDL if it's missing) by YoutubeDL if it's missing)

View File

@ -1,75 +1,100 @@
import operator
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, parse_xml_doc,
ExtractorError, unified_strdate,
) )
class ZDFIE(InfoExtractor): class ZDFIE(InfoExtractor):
_VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
_MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('video_id') video_id = mobj.group('video_id')
if mobj.group('hash'): xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
url = url.replace(u'#', u'', 1) info_xml = self._download_webpage(
xml_url, video_id, note=u'Downloading video info')
doc = parse_xml_doc(info_xml)
html = self._download_webpage(url, video_id) title = doc.find('.//information/title').text
streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] description = doc.find('.//information/detail').text
if streams is None: uploader_node = doc.find('.//details/originChannelTitle')
raise ExtractorError(u'No media url found.') uploader = None if uploader_node is None else uploader_node.text
duration_str = doc.find('.//details/length').text
duration_m = re.match(r'''(?x)^
(?P<hours>[0-9]{2})
:(?P<minutes>[0-9]{2})
:(?P<seconds>[0-9]{2})
(?:\.(?P<ms>[0-9]+)?)
''', duration_str)
duration = (
(
(int(duration_m.group('hours')) * 60 * 60) +
(int(duration_m.group('minutes')) * 60) +
int(duration_m.group('seconds'))
)
if duration_m
else None
)
upload_date = unified_strdate(doc.find('.//details/airtime').text)
# s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url def xml_to_format(fnode):
# s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url video_url = fnode.find('url').text
# choose first/default media type and highest quality for now is_available = u'http://www.metafilegenerator' not in video_url
def stream_pref(s):
TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] format_id = fnode.attrib['basetype']
format_m = re.match(r'''(?x)
(?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
(?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
''', format_id)
PROTO_ORDER = ['http', 'rtmp', 'rtsp']
try: try:
type_pref = TYPE_ORDER.index(s['media_type']) proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
except ValueError: except ValueError:
type_pref = 999 proto_pref = 999
QUALITY_ORDER = ['veryhigh', '300'] quality = fnode.find('./quality').text
QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
try: try:
quality_pref = QUALITY_ORDER.index(s['quality']) quality_pref = -QUALITY_ORDER.index(quality)
except ValueError: except ValueError:
quality_pref = 999 quality_pref = 999
return (type_pref, quality_pref) abr = int(fnode.find('./audioBitrate').text) // 1000
vbr = int(fnode.find('./videoBitrate').text) // 1000
pref = (is_available, proto_pref, quality_pref, vbr, abr)
sorted_streams = sorted(streams, key=stream_pref) return {
if not sorted_streams: 'format_id': format_id,
raise ExtractorError(u'No stream found.') 'url': video_url,
stream = sorted_streams[0] 'ext': format_m.group('container'),
'acodec': format_m.group('acodec'),
'vcodec': format_m.group('vcodec'),
'abr': abr,
'vbr': vbr,
'width': int(fnode.find('./width').text),
'height': int(fnode.find('./height').text),
'quality_name': quality,
'filesize': int(fnode.find('./filesize').text),
'format_note': None if is_available else u'(unavailable)',
'_pref': pref,
}
media_link = self._download_webpage( format_nodes = doc.findall('.//formitaeten/formitaet')
stream['video_url'], formats = sorted(map(xml_to_format, format_nodes),
video_id, key=operator.itemgetter('_pref'))
u'Get stream URL')
#MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
mobj = re.search(self._MEDIA_STREAM, media_link)
if mobj is None:
mobj = re.search(RTSP_STREAM, media_link)
if mobj is None:
raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
video_url = mobj.group('video_url')
title = self._html_search_regex(
r'<h1(?: class="beitragHeadline")?>(.*?)</h1>',
html, u'title')
return { return {
'id': video_id, 'id': video_id,
'url': video_url,
'title': title, 'title': title,
'ext': determine_ext(video_url) 'formats': formats,
'description': description,
'uploader': uploader,
'duration': duration,
'upload_date': upload_date,
} }

View File

@ -8,6 +8,7 @@ import gzip
import io import io
import json import json
import locale import locale
import math
import os import os
import pipes import pipes
import platform import platform
@ -16,6 +17,7 @@ import ssl
import socket import socket
import sys import sys
import traceback import traceback
import xml.etree.ElementTree
import zlib import zlib
try: try:
@ -1006,3 +1008,22 @@ def unsmuggle_url(smug_url):
jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0] jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
data = json.loads(jsond) data = json.loads(jsond)
return url, data return url, data
def parse_xml_doc(s):
    """Parse the XML document held in the text string s.

    s must be a text (unicode) string rather than bytes; it is encoded as
    UTF-8 before being handed to ElementTree.

    Returns the root element of the parsed document.
    Raises TypeError if s is not a text string.
    """
    # Explicit check rather than assert, which is stripped under python -O.
    if not isinstance(s, type(u'')):
        raise TypeError(
            u'parse_xml_doc expects a text string, got %s' % type(s).__name__)
    return xml.etree.ElementTree.fromstring(s.encode('utf-8'))
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be None, a number, or a numeric string. None yields u'N/A';
    any other value is scaled by powers of 1024 and rendered with two
    decimals followed by the unit, e.g. 1536 -> u'1.50KiB'.

    Magnitudes below one byte stay in u'B', magnitudes beyond the largest
    known unit stay in u'YiB', and negative values keep their sign.
    """
    if bytes is None:
        return u'N/A'
    # Accept both text types so Python 2 unicode input is converted as well.
    if isinstance(bytes, (type(u''), type(''))):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    scaled = float(bytes)
    exponent = 0
    # Repeated division instead of int(math.log(...)): the logarithm is
    # negative for tiny fractions (which indexed the suffix list from the
    # end), undefined for negative input, and unbounded for huge values.
    while abs(scaled) >= 1024.0 and exponent < len(suffixes) - 1:
        scaled /= 1024.0
        exponent += 1
    return u'%.2f%s' % (scaled, suffixes[exponent])