From 8b8c1093b65ee02aad859ed8d82217312ed0d9d8 Mon Sep 17 00:00:00 2001
From: Shaun Walbridge
Date: Sat, 18 Apr 2015 00:37:04 -0400
Subject: [PATCH] [EsriVideo] Add new extractor

Add extractor for [video.esri.com](https://video.esri.com), a
collection of videos relating to GIS.
---
 youtube_dl/extractor/__init__.py  |   1 +
 youtube_dl/extractor/videoesri.py | 103 ++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 youtube_dl/extractor/videoesri.py

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 83d21bd15..a4387636f 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -695,6 +695,7 @@ from .vice import ViceIE
 from .viddler import ViddlerIE
 from .videobam import VideoBamIE
 from .videodetective import VideoDetectiveIE
+from .videoesri import VideoEsriIE
 from .videolecturesnet import VideoLecturesNetIE
 from .videofyme import VideofyMeIE
 from .videomega import VideoMegaIE
diff --git a/youtube_dl/extractor/videoesri.py b/youtube_dl/extractor/videoesri.py
new file mode 100644
index 000000000..0f84323a4
--- /dev/null
+++ b/youtube_dl/extractor/videoesri.py
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    unified_strdate
+)
+
+
+class VideoEsriIE(InfoExtractor):
+    """Extractor for individual videos at video.esri.com/watch/<id>."""
+
+    _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://video.esri.com/watch/4228',
+        'md5': '170b4d513c2466ed483c150a48384133',
+        'info_dict': {
+            'id': '4228',
+            'ext': 'mp4',
+            'title': 'AppStudio for ArcGIS',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20150310',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Scrape a watch page and return the info dict for its video.
+
+        The title and upload date come from the page markup; the
+        thumbnail and the mp4 locations come from the inline
+        ``evPlayerSettings`` JavaScript object.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # NOTE(review): <h1> as the title container is an assumption;
+        # confirm against the live page markup.
+        title = self._html_search_regex(
+            r'<h1>(.*?)</h1>', webpage, 'title')
+
+        # The upload date is optional metadata, so a page without the
+        # header must not abort the extraction.
+        upload_date_raw = self._search_regex(
+            r'http-equiv="last-modified" content="([^"]+)"',
+            webpage, 'upload date', fatal=False)
+        upload_date = (
+            unified_strdate(upload_date_raw) if upload_date_raw else None)
+
+        settings_info = self._search_regex(
+            r'evPlayerSettings = {(.*?);\s*$',
+            webpage, 'settings info', flags=re.MULTILINE | re.DOTALL)
+
+        # thumbnail includes '_x' for large, also has {_m,_t,_s} or
+        # without size suffix returns full image
+        thumbnail_path = self._search_regex(
+            r'image\': \'(\/thumbs[^\']*)\'',
+            settings_info, 'thumbnail', default=None)
+        # The captured path already starts with '/', so plain
+        # concatenation avoids a doubled slash in the URL.
+        thumbnail = (
+            'http://video.esri.com' + thumbnail_path
+            if thumbnail_path else None)
+
+        # find possible http servers of the mp4 files (also has rtsp)
+        base_url = self._search_regex(
+            r'netstreambasepath\':\s\'(h[^\']*)\'',
+            settings_info, 'base url')
+
+        # these are the numbers used internally, but really map
+        # to other resolutions, e.g. 960 is 720p.
+        heights = [480, 720, 960]
+        videos_by_res = {}
+        # note that this misses the (exceedingly rare) webm files
+        for video_path in re.findall(r'mp4:([^\']*)\'', settings_info):
+            filename, ext = os.path.splitext(video_path)
+            try:
+                height_label = int(filename.split('_')[1])
+            except (IndexError, ValueError):
+                # Not a '<name>_<height>' file name; skip it rather
+                # than abort the whole extraction.
+                continue
+            videos_by_res[height_label] = {
+                'url': base_url + video_path,
+                'ext': ext[1:],
+                'protocol': 'http',  # http-only supported currently
+            }
+
+        # Emit the known labels in ascending order.
+        formats = [
+            videos_by_res[height]
+            for height in heights if height in videos_by_res]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }