From a3513de52cd3f5de7e8282e2e4ab32277e9d7856 Mon Sep 17 00:00:00 2001 From: Alcaro Date: Sun, 3 Jul 2016 00:32:39 +0200 Subject: [PATCH 1/6] Add mp4upload extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/mp4upload.py | 105 +++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 youtube_dl/extractor/mp4upload.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5dab055db..11e1f27b3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -455,6 +455,7 @@ from .mixcloud import ( ) from .mlb import MLBIE from .mnet import MnetIE +from .mp4upload import Mp4UploadIE from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE diff --git a/youtube_dl/extractor/mp4upload.py b/youtube_dl/extractor/mp4upload.py new file mode 100644 index 000000000..e5132abfd --- /dev/null +++ b/youtube_dl/extractor/mp4upload.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re +import time + + +class Mp4UploadIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mp4upload\.com/(?P.+)' + _TEST = { + 'url': 'http://www.mp4upload.com/e52ycvdl4x29', + 'md5': '09780a74b0de79ada5f9a8955f0704fc', + + 'info_dict': { + 'id': 'e52ycvdl4x29', + 'ext': 'mp4', + 'title': '橋本潮 - ロマンティックあげるよ.mp4', + 'timestamp': 1467471956, + 'thumbnail': 'http://www3.mp4upload.com/i/00283/e52ycvdl4x29.jpg', + + 'vcodec': 'ffh264', + 'width': 454, + 'height': 360, + 'fps': 29.970, + + 'acodec': 'ffaac', + 'asr': 44100, + 'abr': 96, + + # Something adds this to the _real_extract return value, and the test runner expects it present. + # Should probably be autocalculated from the timestamp instead, just like _real_extract. + 'upload_date': '20160702', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + embedpage = self._download_webpage("http://www.mp4upload.com/embed-" + video_id + ".html", video_id) + + title = self._html_search_regex(r'(.*?)', webpage, 'title') + url = self._html_search_regex(r'"file": "([^"]+)",', embedpage, 'url') + thumbnail = self._html_search_regex(r'"image": "([^"]+)",', embedpage, 'url', fatal=False) + + acodec = None + asr = None + abr = None + audio_raw = self._html_search_regex( + r'
  • Audio info:(.+?)
  • ', + webpage, 'audioinfo', fatal=False) + if audio_raw: + audmatch = re.search(r'(.+?), ([0-9]+) kbps, ([0-9]+) Hz', audio_raw) + if audmatch: + (acodec, abr, asr) = audmatch.groups() + + # can't use _html_search_regex, there's data both inside and outside a bold tag and I need it all + timestamp = None + date_raw = self._search_regex(r'Uploaded on(.+?)', webpage, 'timestamp', fatal=False, flags=re.DOTALL) + if date_raw: + date_raw = re.sub(r"<[^>]+>", "", date_raw) + date_raw = re.sub(r"[\s]+", " ", date_raw) + timestamp = time.mktime(time.strptime(date_raw, " %Y-%m-%d %H:%M:%S ")) + + width = None + height = None + resolution_raw = self._html_search_regex( + r'
  • Resolution:(.+?)
  • ', + webpage, 'resolution', fatal=False) + if resolution_raw: + resmatch = re.search(r'([0-9]+) x ([0-9]+)', resolution_raw) + if resmatch: + (width, height) = resmatch.groups() + + vcodec = self._html_search_regex( + r'
  • Codec:(.+?)
  • ', + webpage, 'codec', fatal=False) + + fps = self._html_search_regex( + r'
  • Framerate:(.+?) fps
  • ', + webpage, 'framerate', fatal=False) + + filesize_approx = self._html_search_regex(r'Size\s+(.+?)', + webpage, 'filesize', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'formats': [{ + 'url': url, + 'filesize_approx': filesize_approx, + + 'vcodec': vcodec, + 'width': int(width), + 'height': int(height), + 'fps': float(fps), + + 'acodec': acodec, + 'asr': int(asr), + 'abr': int(abr), + }], + 'timestamp': timestamp, + 'thumbnail': thumbnail, + } From 1f0a59c684b1cda2815a5e51965c054be54725dc Mon Sep 17 00:00:00 2001 From: bato3 Date: Thu, 12 Jul 2018 23:38:17 +0200 Subject: [PATCH 2/6] change `regex` to `_extract_jwplayer_data` --- youtube_dl/extractor/mp4upload.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/mp4upload.py b/youtube_dl/extractor/mp4upload.py index 3a8fde02a..308d69ea0 100644 --- a/youtube_dl/extractor/mp4upload.py +++ b/youtube_dl/extractor/mp4upload.py @@ -73,19 +73,16 @@ class Mp4UploadIE(InfoExtractor): raise ExtractorError('I can\'t find file info', video_id=video_id) embedpage = self._download_webpage(embed_url, video_id, note='Downloading embed webpage') - # _find_jwplayer_data don't work - mobj = re.search( - r'player.setup\((?P{.+?})\);', - decode_packed_codes(get_element_by_id("player", embedpage)).replace("\\'", '"') - ) - if not mobj: - raise ExtractorError('I can\'t find player data', video_id=video_id) # It contains only `source url` and `thumbnail` - poor_info_dict = self._parse_jwplayer_data( - self._parse_json( - mobj.group('options'), video_id=video_id, transform_source=js_to_json - ), video_id, base_url=embed_url, require_title=False) + poor_info_dict = self._extract_jwplayer_data( + decode_packed_codes( + get_element_by_id("player", embedpage) + ).replace("\\'", '"'), + video_id, base_url=embed_url, require_title=False + ) + if not poor_info_dict: + raise ExtractorError('I can\'t find player data', video_id=video_id) info_dict['thumbnail'] = poor_info_dict.get('thumbnail') _f = { From 0f70ac8403425490c77ec0ff2281cb3ff8ca21d1 Mon Sep 17 00:00:00 2001 From: bato3 Date: Fri, 13 Jul 2018 01:02:28 +0200 Subject: [PATCH 3/6] single quotes --- youtube_dl/extractor/mp4upload.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/mp4upload.py b/youtube_dl/extractor/mp4upload.py index 308d69ea0..2bb985214 100644 --- a/youtube_dl/extractor/mp4upload.py +++ b/youtube_dl/extractor/mp4upload.py @@ -9,7 +9,6 @@ from ..utils import ( decode_packed_codes, get_element_by_class, get_element_by_id, - js_to_json, parse_filesize, strip_or_none, ) @@ -66,7 +65,7 @@ class Mp4UploadIE(InfoExtractor): } file_info = re.findall( - r'">(?P