1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-22 16:44:32 +01:00

[youtube] modify regex to get chapters from description (closes #24819)

This commit is contained in:
Jaime Bergas Laborda 2020-04-19 20:02:40 +01:00
parent 00eb865b3c
commit 78d7146748
2 changed files with 36 additions and 1 deletion

View File

@@ -15,6 +15,41 @@ from youtube_dl.extractor import YoutubeIE
class TestYoutubeChapters(unittest.TestCase): class TestYoutubeChapters(unittest.TestCase):
_TEST_CASES = [ _TEST_CASES = [
(
# https://www.youtube.com/watch?v=gBRKnvK1JUE
# pattern: 00:00 - 09:24 <title>
'''Here is Nucleus's 1979 album Out Of The Long Dark: https://www.youtube.com/watch?v=GX4Eh1DPb-E<br /><br />And here is their 1971 live album: https://www.youtube.com/watch?v=cpbM75B8qaE<br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> - <a href="#" onclick="yt.www.watch.player.seekTo(09*60+24);return false;">09:24</a> roots<br /><a href="#" onclick="yt.www.watch.player.seekTo(09*60+24);return false;">09:24</a> - <a href="#" onclick="yt.www.watch.player.seekTo(14*60+19);return false;">14:19</a> images<br /><a href="#" onclick="yt.www.watch.player.seekTo(14*60+19);return false;">14:19</a> - <a href="#" onclick="yt.www.watch.player.seekTo(18*60+20);return false;">18:20</a> caliban<br /><a href="#" onclick="yt.www.watch.player.seekTo(18*60+20);return false;">18:20</a> - <a href="#" onclick="yt.www.watch.player.seekTo(21*60+42);return false;">21:42</a> whapatiti<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+42);return false;">21:42</a> - <a href="#" onclick="yt.www.watch.player.seekTo(26*60+18);return false;">26:18</a> capricorn<br /><a href="#" onclick="yt.www.watch.player.seekTo(26*60+18);return false;">26:18</a> - <a href="#" onclick="yt.www.watch.player.seekTo(29*60+42);return false;">29:42</a> odokamona<br /><a href="#" onclick="yt.www.watch.player.seekTo(29*60+42);return false;">29:42</a> - <a href="#" onclick="yt.www.watch.player.seekTo(37*60+26);return false;">37:26</a> southern roots and celebration<br /><br />Bass Guitar Roger Sutton<br />Design Keith Davis (3)<br />Drums Clive Thacker<br />Engineer Roger Wake<br />Guitar Jocelyn Pitchen<br />Percussion Aureo de Souza<br />Piano, Electric Piano Dave MacRae<br />Producer Fritz Fryer<br />Tenor Saxophone, Soprano Saxophone, Flute, Flute [Bamboo] Brian Smith<br />Trumpet Ian Carr<br />Vocals Joy Yates<br />Written-By Brian Smith (tracks: B1 to B3), Dave MacRae (tracks: B4), Ian Carr (tracks: A) ''',
2246,
[{
'start_time': 0,
'end_time': 564,
'title': 'roots',
}, {
'start_time': 564,
'end_time': 859,
'title': 'images',
}, {
'start_time': 859,
'end_time': 1100,
'title': 'caliban',
}, {
'start_time': 1100,
'end_time': 1302,
'title': 'whapatiti',
}, {
'start_time': 1302,
'end_time': 1578,
'title': 'capricorn',
}, {
'start_time': 1578,
'end_time': 1782,
'title': 'odokamona',
}, {
'start_time': 1782,
'end_time': 2246,
'title': 'southern roots and celebration',
}]
),
( (
# https://www.youtube.com/watch?v=A22oy8dFjqc # https://www.youtube.com/watch?v=A22oy8dFjqc
# pattern: 00:00 - <title> # pattern: 00:00 - <title>

View File

@@ -1621,7 +1621,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not description: if not description:
return None return None
chapter_lines = re.findall( chapter_lines = re.findall(
r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>(?:[^<]*<a.*?)?[^>]*)(?=$|<br\s*/>)',
description) description)
if not chapter_lines: if not chapter_lines:
return None return None