From 7022e24b1dc8897cbbdd807b32fbf2691b7ecf44 Mon Sep 17 00:00:00 2001 From: Nehal Patel Date: Tue, 12 Jul 2016 19:08:03 -0500 Subject: [PATCH] [BrainPOP] Optimize regex and extractor, improve metadata, and add subscription video detection --- youtube_dl/extractor/brainpop.py | 51 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/brainpop.py b/youtube_dl/extractor/brainpop.py index a930942b2..f3fc66ee1 100644 --- a/youtube_dl/extractor/brainpop.py +++ b/youtube_dl/extractor/brainpop.py @@ -2,42 +2,44 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + remove_end +) class BrainPOPIE(InfoExtractor): - _VALID_URL = r'https?:\/\/(?:(.+)\.)?brainpop\.com\/(?P[^\r\n]+)' + _VALID_URL = r'https?:\/\/(?:(.+)\.)?brainpop\.com\/[^/]+/[^/]+/(?P[^/?#&]+)' _TEST = { 'url': 'https://www.brainpop.com/english/freemovies/williamshakespeare/', 'md5': '676d936271b628dc05e4cec377751919', 'info_dict': { - 'id': 'english/freemovies/williamshakespeare/', + 'id': '3026', + 'display_id': 'williamshakespeare', 'ext': 'mp4', - 'title': 'William Shakespeare - BrainPOP', + 'title': 'William Shakespeare', 'thumbnail': 're:^https?://.*\.png$', 'description': 'He could do comedies, tragedies, histories and poetry. Learn about the greatest playwright in the history of the English language!', } } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - self.report_extraction(video_id) + content = self._parse_json(self._html_search_regex(r'var content = ([^;]*)', webpage, 'content'), display_id) + + if content['category']['unit']['topic']['free'] == 'no': + self.raise_login_required('%s is only available for users with Subscriptions' % display_id) + global_content = self._parse_json(self._html_search_regex(r'var global_content = ([^;]*)', webpage, 'global content').replace("'", '"'), display_id) + cdn_path = global_content.get('cdn_path', 'https://cdn.brainpop.com') + movie_cdn_path = global_content.get('movie_cdn_path', 'https://svideos.brainpop.com') ec_token = self._html_search_regex(r"ec_token : '([^']*)'", webpage, 'token') - settings = self._parse_json(self._html_search_regex(r'var settings = ([^;]*)', webpage, 'settings'), video_id) - title = settings['title'] - description = settings['description'] + screenshots = content['category']['unit']['topic'].get('screenshots', {}) + thumbnails = [{'url': cdn_path + screenshot} for screenshot in screenshots] - global_content = self._parse_json(self._html_search_regex(r'var global_content = ([^;]*)', webpage, 'global content').replace("'", '"'), video_id) - cdn_path = global_content['cdn_path'] - movie_cdn_path = global_content['movie_cdn_path'] - - content = self._parse_json(self._html_search_regex(r'var content = ([^;]*)', webpage, 'content'), video_id) movies = content['category']['unit']['topic']['movies'] - screenshots = content['category']['unit']['topic']['screenshots'] - formats = [] formats.append({ 'url': movie_cdn_path + movies['mp4'] + '?' + ec_token, @@ -50,17 +52,14 @@ class BrainPOPIE(InfoExtractor): 'width': 480, }) self._sort_formats(formats) - - thumbnails = [] - for (i, screenshot) in enumerate(screenshots): - thumbnails.append({ - 'url': cdn_path + screenshot, - }) + + settings = self._parse_json(self._html_search_regex(r'var settings = ([^;]*)', webpage, 'settings'), display_id) return { - 'id': video_id, - 'title': title, - 'formats': formats, + 'id': content['category']['unit']['topic']['EntryID'], + 'display_id': display_id, + 'title': remove_end(settings['title'], ' - BrainPOP'), + 'description': settings['description'], 'thumbnails': thumbnails, - 'description': description, + 'formats': formats, }