From 78d71467484aab86b46466e2ba76a0a289c15fe7 Mon Sep 17 00:00:00 2001 From: Jaime Bergas Laborda Date: Sun, 19 Apr 2020 20:02:40 +0100 Subject: [PATCH] [youtube] modify regex to get chapters from description (closes #24819) --- test/test_youtube_chapters.py | 35 +++++++++++++++++++++++++++++++++ youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py index 324ca8525..63a9a88c1 100644 --- a/test/test_youtube_chapters.py +++ b/test/test_youtube_chapters.py @@ -15,6 +15,41 @@ from youtube_dl.extractor import YoutubeIE class TestYoutubeChapters(unittest.TestCase): _TEST_CASES = [ + ( + # https://www.youtube.com/watch?v=gBRKnvK1JUE + # pattern: 00:00 - 09:24 + '''Here is Nucleus's 1979 album Out Of The Long Dark: https://www.youtube.com/watch?v=GX4Eh1DPb-E<br /><br />And here is their 1971 live album: https://www.youtube.com/watch?v=cpbM75B8qaE<br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> - <a href="#" onclick="yt.www.watch.player.seekTo(09*60+24);return false;">09:24</a> roots<br /><a href="#" onclick="yt.www.watch.player.seekTo(09*60+24);return false;">09:24</a> - <a href="#" onclick="yt.www.watch.player.seekTo(14*60+19);return false;">14:19</a> images<br /><a href="#" onclick="yt.www.watch.player.seekTo(14*60+19);return false;">14:19</a> - <a href="#" onclick="yt.www.watch.player.seekTo(18*60+20);return false;">18:20</a> caliban<br /><a href="#" onclick="yt.www.watch.player.seekTo(18*60+20);return false;">18:20</a> - <a href="#" onclick="yt.www.watch.player.seekTo(21*60+42);return false;">21:42</a> whapatiti<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+42);return false;">21:42</a> - <a href="#" onclick="yt.www.watch.player.seekTo(26*60+18);return false;">26:18</a> capricorn<br /><a href="#" onclick="yt.www.watch.player.seekTo(26*60+18);return false;">26:18</a> - <a href="#" onclick="yt.www.watch.player.seekTo(29*60+42);return false;">29:42</a> odokamona<br /><a href="#" onclick="yt.www.watch.player.seekTo(29*60+42);return false;">29:42</a> - <a href="#" onclick="yt.www.watch.player.seekTo(37*60+26);return false;">37:26</a> southern roots and celebration<br /><br />Bass Guitar – Roger Sutton<br />Design – Keith Davis (3)<br />Drums – Clive Thacker<br />Engineer – Roger Wake<br />Guitar – Jocelyn Pitchen<br />Percussion – Aureo de Souza<br />Piano, Electric Piano – Dave MacRae<br />Producer – Fritz Fryer<br />Tenor Saxophone, Soprano Saxophone, Flute, Flute [Bamboo] – Brian Smith<br />Trumpet – Ian Carr<br />Vocals – Joy Yates<br />Written-By – Brian Smith (tracks: B1 to B3), Dave MacRae (tracks: B4), Ian Carr (tracks: A) ''', + 2246, + [{ + 'start_time': 0, + 'end_time': 564, + 'title': 'roots', + }, { + 'start_time': 564, + 'end_time': 859, + 'title': 'images', + }, { + 'start_time': 859, + 'end_time': 1100, + 'title': 'caliban', + }, { + 'start_time': 1100, + 'end_time': 1302, + 'title': 'whapatiti', + }, { + 'start_time': 1302, + 'end_time': 1578, + 'title': 'capricorn', + }, { + 'start_time': 1578, + 'end_time': 1782, + 'title': 'odokamona', + }, { + 'start_time': 1782, + 'end_time': 2246, + 'title': 'southern roots and celebration', + }] + ), ( # https://www.youtube.com/watch?v=A22oy8dFjqc # pattern: 00:00 - <title> diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index afaa12b1b..7c38531bf 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1621,7 +1621,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not description: return None chapter_lines = re.findall( - r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', + r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>(?:[^<]*<a.*?)?[^>]*)(?=$|<br\s*/>)', description) if not chapter_lines: return None