Switch to parsing the page's JSON content and improve the URL regex pattern

2024-11-29 19:47:54 +01:00 · 2020-05-14 01:19:58 +02:00 · 2020-05-14 01:19:58 +02:00 · c8dba62857
commit c8dba62857
parent 266714aeaf
1 changed file with 28 additions and 27 deletions
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -278,25 +278,16 @@ class ITVIE(InfoExtractor):


 class ITVBTCCIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(articles|races)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-    _TESTS = [
-        {
+    _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TEST = {
        'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
        'info_dict': {
            'id': 'btcc-2019-brands-hatch-gp-race-action',
            'title': 'BTCC 2019: Brands Hatch GP race action',
        },
        'playlist_mincount': 12,
-        },
-        {
-            'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
-            'info_dict': {
-                'id': 'btcc-2018-all-the-action-from-brands-hatch',
-                'title': 'BTCC 2018: All the action from Brands Hatch',
-            },
-            'playlist_mincount': 9,
    }
-    ]
+
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'

    def _real_extract(self, url):
@@ -304,18 +295,28 @@ class ITVBTCCIE(InfoExtractor):

        webpage = self._download_webpage(url, playlist_id)

+        json_map = self._html_search_regex(
+            '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+            webpage,
+            'json_map'
+        )
+
        entries = [
            self.url_result(
-                smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
+                smuggle_url(
+                    self.BRIGHTCOVE_URL_TEMPLATE % video_id['data'].get('id'), {
                        # ITV does not like some GB IP ranges, so here are some
                        # IP blocks it accepts
                        'geo_ip_blocks': [
                            '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
                        ],
                        'referrer': url,
-                }),
+                    }
+                ),
                ie=BrightcoveNewIE.ie_key(), video_id=video_id)
-            for video_id in re.findall(r'["\']data["\']:{["\']id["\']:(\d+),', webpage)]
+            for video_id in self._parse_json(
+                json_map, playlist_id
+            )['props']['pageProps']['article']['body']['content']]

        title = self._og_search_title(webpage, fatal=False)