1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-22 16:44:32 +01:00

[utils] add support for ttml styles

This commit is contained in:
Remita Amine 2017-02-23 18:46:20 +01:00
parent 75a2485407
commit 5b995f713b
2 changed files with 152 additions and 9 deletions

View File

@ -1069,6 +1069,47 @@ The first line
'''
self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
dfxp_data_with_style = '''<?xml version="1.0" encoding="utf-8"?>
<tt xmlns="http://www.w3.org/2006/10/ttaf1" xmlns:ttp="http://www.w3.org/2006/10/ttaf1#parameter" ttp:timeBase="media" xmlns:tts="http://www.w3.org/2006/10/ttaf1#style" xml:lang="en" xmlns:ttm="http://www.w3.org/2006/10/ttaf1#metadata">
<head>
<styling>
<style id="s2" style="s0" tts:color="cyan" tts:fontWeight="bold" />
<style id="s1" style="s0" tts:color="yellow" tts:fontStyle="italic" />
<style id="s3" style="s0" tts:color="lime" tts:textDecoration="underline" />
<style id="s0" tts:backgroundColor="black" tts:fontStyle="normal" tts:fontSize="16" tts:fontFamily="sansSerif" tts:color="white" />
</styling>
</head>
<body tts:textAlign="center" style="s0">
<div>
<p begin="00:00:02.08" id="p0" end="00:00:05.84">default style<span tts:color="red">custom style</span></p>
<p style="s2" begin="00:00:02.08" id="p0" end="00:00:05.84"><span tts:color="lime">part 1<br /></span><span tts:color="cyan">part 2</span></p>
<p style="s3" begin="00:00:05.84" id="p1" end="00:00:09.56">line 3<br />part 3</p>
<p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
</div>
</body>
</tt>'''
srt_data = '''1
00:00:02,080 --> 00:00:05,839
<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
2
00:00:02,080 --> 00:00:05,839
<b><font color="cyan" face="sansSerif" size="16"><font color="lime">part 1
</font>part 2</font></b>
3
00:00:05,839 --> 00:00:09,560
<u><font color="lime">line 3
part 3</font></u>
4
00:00:09,560 --> 00:00:12,359
<i><u><font color="yellow"><font color="lime">inner
</font>style</font></u></i>
'''
self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
def test_cli_option(self):
self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])

View File

@ -2511,27 +2511,97 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data):
LEGACY_NAMESPACES = (
('http://www.w3.org/ns/ttml', [
'http://www.w3.org/2004/11/ttaf1',
'http://www.w3.org/2006/04/ttaf1',
'http://www.w3.org/2006/10/ttaf1',
]),
('http://www.w3.org/ns/ttml#styling', [
'http://www.w3.org/ns/ttml#style',
]),
)
SUPPORTED_STYLING = [
'color',
'fontFamily',
'fontSize',
'fontStyle',
'fontWeight',
'textDecoration'
]
_x = functools.partial(xpath_with_ns, ns_map={
'ttml': 'http://www.w3.org/ns/ttml',
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
'tts': 'http://www.w3.org/ns/ttml#styling',
})
styles = {}
default_style = {}
class TTMLPElementParser(object):
out = ''
_out = ''
_unclosed_elements = []
_applied_styles = []
def start(self, tag, attrib):
if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
self.out += '\n'
if tag in (_x('ttml:br'), 'br'):
self._out += '\n'
else:
unclosed_elements = []
style = {}
element_style_id = attrib.get('style')
if default_style:
style.update(default_style)
if element_style_id:
style.update(styles.get(element_style_id, {}))
for prop in SUPPORTED_STYLING:
prop_val = attrib.get(_x('tts:' + prop))
if prop_val:
style[prop] = prop_val
if style:
font = ''
for k, v in sorted(style.items()):
if self._applied_styles and self._applied_styles[-1].get(k) == v:
continue
if k == 'color':
font += ' color="%s"' % v
elif k == 'fontSize':
font += ' size="%s"' % v
elif k == 'fontFamily':
font += ' face="%s"' % v
elif k == 'fontWeight' and v == 'bold':
self._out += '<b>'
unclosed_elements.append('b')
elif k == 'fontStyle' and v == 'italic':
self._out += '<i>'
unclosed_elements.append('i')
elif k == 'textDecoration' and v == 'underline':
self._out += '<u>'
unclosed_elements.append('u')
if font:
self._out += '<font' + font + '>'
unclosed_elements.append('font')
applied_style = {}
if self._applied_styles:
applied_style.update(self._applied_styles[-1])
applied_style.update(style)
self._applied_styles.append(applied_style)
self._unclosed_elements.append(unclosed_elements)
def end(self, tag):
pass
if tag not in (_x('ttml:br'), 'br'):
unclosed_elements = self._unclosed_elements.pop()
for element in reversed(unclosed_elements):
self._out += '</%s>' % element
if unclosed_elements and self._applied_styles:
self._applied_styles.pop()
def data(self, data):
self.out += data
self._out += data
def close(self):
return self.out.strip()
return self._out.strip()
def parse_node(node):
target = TTMLPElementParser()
@ -2539,13 +2609,45 @@ def dfxp2srt(dfxp_data):
parser.feed(xml.etree.ElementTree.tostring(node))
return parser.close()
for k, v in LEGACY_NAMESPACES:
for ns in v:
dfxp_data = dfxp_data.replace(ns, k)
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
repeat = False
while True:
for style in dfxp.findall(_x('.//ttml:style')):
style_id = style.get('id')
parent_style_id = style.get('style')
if parent_style_id:
if parent_style_id not in styles:
repeat = True
continue
styles[style_id] = styles[parent_style_id].copy()
for prop in SUPPORTED_STYLING:
prop_val = style.get(_x('tts:' + prop))
if prop_val:
styles.setdefault(style_id, {})[prop] = prop_val
if repeat:
repeat = False
else:
break
for p in ('body', 'div'):
ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
if ele is None:
continue
style = styles.get(ele.get('style'))
if not style:
continue
default_style.update(style)
for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
end_time = parse_dfxp_time_expr(para.attrib.get('end'))