From 266714aeafcdb25dab61af943c2c5d27fe6e19c9 Mon Sep 17 00:00:00 2001 From: Fran Hermoso Date: Tue, 12 May 2020 23:37:20 +0200 Subject: [PATCH 1/4] [ITVBTCC] Update playlist extractor (closes #25240) --- youtube_dl/extractor/itv.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index ad2f4eca5..3f11aae1c 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -278,15 +278,25 @@ class ITVIE(InfoExtractor): class ITVBTCCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', - 'info_dict': { - 'id': 'btcc-2018-all-the-action-from-brands-hatch', - 'title': 'BTCC 2018: All the action from Brands Hatch', + _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(articles|races)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [ + { + 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', + 'info_dict': { + 'id': 'btcc-2019-brands-hatch-gp-race-action', + 'title': 'BTCC 2019: Brands Hatch GP race action', + }, + 'playlist_mincount': 12, }, - 'playlist_mincount': 9, - } + { + 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', + 'info_dict': { + 'id': 'btcc-2018-all-the-action-from-brands-hatch', + 'title': 'BTCC 2018: All the action from Brands Hatch', + }, + 'playlist_mincount': 9, + } + ] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' def _real_extract(self, url): @@ -305,7 +315,7 @@ class ITVBTCCIE(InfoExtractor): 'referrer': url, }), ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + for video_id in re.findall(r'["\']data["\']:{["\']id["\']:(\d+),', webpage)] title = self._og_search_title(webpage, fatal=False) From 
c8dba62857723e7a7bb2bc49d7c391f1a886550d Mon Sep 17 00:00:00 2001 From: Fran Hermoso Date: Thu, 14 May 2020 01:19:58 +0200 Subject: [PATCH 2/4] Moved to parsing json content and improved regex pattern --- youtube_dl/extractor/itv.py | 55 +++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 3f11aae1c..759a45668 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -278,25 +278,16 @@ class ITVIE(InfoExtractor): class ITVBTCCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(articles|races)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [ - { - 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', - 'info_dict': { - 'id': 'btcc-2019-brands-hatch-gp-race-action', - 'title': 'BTCC 2019: Brands Hatch GP race action', - }, - 'playlist_mincount': 12, + _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', + 'info_dict': { + 'id': 'btcc-2019-brands-hatch-gp-race-action', + 'title': 'BTCC 2019: Brands Hatch GP race action', }, - { - 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', - 'info_dict': { - 'id': 'btcc-2018-all-the-action-from-brands-hatch', - 'title': 'BTCC 2018: All the action from Brands Hatch', - }, - 'playlist_mincount': 9, - } - ] + 'playlist_mincount': 12, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' def _real_extract(self, url): @@ -304,18 +295,28 @@ class ITVBTCCIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) + json_map = self._html_search_regex( + '', + webpage, + 'json_map' + ) + entries = [ self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { - # ITV does not like some GB IP ranges, so here are some - # IP blocks it accepts - 
'geo_ip_blocks': [ - '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' - ], - 'referrer': url, - }), + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % video_id['data'].get('id'), { + # ITV does not like some GB IP ranges, so here are some + # IP blocks it accepts + 'geo_ip_blocks': [ + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + ], + 'referrer': url, + } + ), ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in re.findall(r'["\']data["\']:{["\']id["\']:(\d+),', webpage)] + for video_id in self._parse_json( + json_map, playlist_id + )['props']['pageProps']['article']['body']['content']] title = self._og_search_title(webpage, fatal=False) From 0381cbe57639629ddd0504248250818bf10424b7 Mon Sep 17 00:00:00 2001 From: Fran Hermoso Date: Thu, 14 May 2020 17:04:54 +0200 Subject: [PATCH 3/4] [ITVBTCC] Improve reliability --- youtube_dl/extractor/itv.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 759a45668..d55a0cbe8 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -20,6 +20,7 @@ from ..utils import ( merge_dicts, parse_duration, smuggle_url, + try_get, url_or_none, xpath_with_ns, xpath_element, @@ -285,7 +286,7 @@ class ITVBTCCIE(InfoExtractor): 'id': 'btcc-2019-brands-hatch-gp-race-action', 'title': 'BTCC 2019: Brands Hatch GP race action', }, - 'playlist_mincount': 12, + 'playlist_count': 12, } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' @@ -295,28 +296,29 @@ class ITVBTCCIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - json_map = self._html_search_regex( - '', - webpage, - 'json_map' - ) + json_map = try_get(self._parse_json(self._html_search_regex( + '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id), + lambda x: x['props']['pageProps']['article']['body']['content']) or [] + + # Discard 
empty objects + video_ids = [] + for video in json_map: + if video['data'].get('id'): + video_ids.append(video['data']['id']) entries = [ self.url_result( smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % video_id['data'].get('id'), { + self.BRIGHTCOVE_URL_TEMPLATE % video_id, { # ITV does not like some GB IP ranges, so here are some # IP blocks it accepts 'geo_ip_blocks': [ '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' ], 'referrer': url, - } - ), + }), ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in self._parse_json( - json_map, playlist_id - )['props']['pageProps']['article']['body']['content']] + for video_id in video_ids] title = self._og_search_title(webpage, fatal=False) From 2f6d8ab91595bed8d485fdac02b84e4058099a68 Mon Sep 17 00:00:00 2001 From: Fran Hermoso Date: Thu, 14 May 2020 17:06:41 +0200 Subject: [PATCH 4/4] [ITVBTCC] Return code to its original indentation --- youtube_dl/extractor/itv.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index d55a0cbe8..e4d60c670 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -308,15 +308,14 @@ class ITVBTCCIE(InfoExtractor): entries = [ self.url_result( - smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % video_id, { - # ITV does not like some GB IP ranges, so here are some - # IP blocks it accepts - 'geo_ip_blocks': [ - '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' - ], - 'referrer': url, - }), + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { + # ITV does not like some GB IP ranges, so here are some + # IP blocks it accepts + 'geo_ip_blocks': [ + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + ], + 'referrer': url, + }), ie=BrightcoveNewIE.ie_key(), video_id=video_id) for video_id in video_ids]