From 11565d1298ec000f69e2f8a735c21a161b19cf21 Mon Sep 17 00:00:00 2001 From: aapjez Date: Thu, 11 Jul 2019 00:59:53 +0200 Subject: [PATCH 1/6] [boundhub] Add extractor --- youtube_dl/extractor/boundhub.py | 76 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 77 insertions(+) create mode 100644 youtube_dl/extractor/boundhub.py diff --git a/youtube_dl/extractor/boundhub.py b/youtube_dl/extractor/boundhub.py new file mode 100644 index 000000000..90481223e --- /dev/null +++ b/youtube_dl/extractor/boundhub.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class BoundHubIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?boundhub\.com/videos/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.boundhub.com/videos/205969/tamina-in-species-appropriate-cage-system-housing/', + 'md5': '6381e2491e6a42cc8e95c529b6da50a8', + 'info_dict': { + 'id': '205969', + 'title': 'Tamina in species-appropriate cage system housing', + 'description': 'Tamina in Straitjacket gagged an locked into a small cage for the afternoon.', + 'display_id': 'tamina-in-species-appropriate-cage-system-housing', + 'duration': 314, + 'ext': 'mp4', + 'thumbnail': 'https://cnt.boundhub.com/contents/videos_screenshots/205000/205969/preview.mp4.jpg', + 'uploader': 'Tamina', + 'uploader_id': 39278, + 'uploader_url': 'https://www.boundhub.com/members/39278/', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Parse duration + duration_text = self._search_regex(r'\s*Duration:\s*([\w ]*)', webpage, 'duration_text', fatal=False) + minutes = self._html_search_regex(r'(\d*)min', duration_text, 'minutes', fatal=False) + seconds = self._html_search_regex(r'(\d*)sec', duration_text, 'seconds', fatal=False) + duration = (int(minutes) * 60) + int(seconds) + + # Parse views + views_text = self._search_regex(r'\s*Views:\s*([\w ]*)', webpage, 'views_text', fatal=False) + views = int_or_none(views_text.replace(' ', '')) + + # Get uploader url and id + uploader_url = self._search_regex(r'\s*([\s\S]+?)', webpage, 'html_screenshots', fatal=False) + regex_screenshots = r'' + thumbnails = list() + + for match in re.findall(regex_screenshots, html_screenshots): + img = dict() + img['url'] = match[0] + img['id'] = int_or_none(os.path.splitext(os.path.basename(img['url']))[0]) + img['width'] = int_or_none(match[1]) + img['height'] = int_or_none(match[2]) + thumbnails.append(img) + + return { + 'id': video_id, + 'title': self._search_regex(r'\s*

(.*)

', webpage, 'title', default=None) or self._og_search_title(webpage), + 'url': self._search_regex(r'video_url: [\"\']([^\"\']*)[\"\']', webpage, 'url'), + 'description': self._search_regex(r'\s*Description:\s*(.*)<\/em>', webpage, 'description', fatal=False), + 'display_id': self._html_search_regex(r'https?://(?:www\.)?boundhub\.com/videos/[0-9]+/([\w-]*)', url, 'display_id', fatal=False), + 'duration': duration, + 'ext': self._html_search_regex(r'postfix:\s*[\"\']\.([^\"\']*)[\"\']', webpage, 'ext', fatal=False), + 'thumbnail': self._html_search_regex(r'preview_url:\s*[\"\']([^\"\']*)[\"\']', webpage, 'thumbnail', fatal=False), + 'thumbnails': thumbnails, + 'uploader': self._search_regex(r'\s*\s*(.*)\s*
', webpage, 'uploader', fatal=False), + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'views': views, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 555fadfaf..9683c9bff 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -119,6 +119,7 @@ from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE +from .boundhub import BoundHubIE from .bpb import BpbIE from .br import ( BRIE, From fdaae7c2a41c18631f196f446b92d27d3e00a310 Mon Sep 17 00:00:00 2001 From: aapjez Date: Fri, 2 Oct 2020 02:55:38 +0200 Subject: [PATCH 2/6] Get higher resolution thumbnails --- youtube_dl/extractor/boundhub.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/boundhub.py b/youtube_dl/extractor/boundhub.py index 90481223e..b489bae0d 100644 --- a/youtube_dl/extractor/boundhub.py +++ b/youtube_dl/extractor/boundhub.py @@ -48,15 +48,14 @@ class BoundHubIE(InfoExtractor): # Get screenshots html_screenshots = self._search_regex(r'([\s\S]+?)', webpage, 'html_screenshots', fatal=False) - regex_screenshots = r'' + regex_screenshots = r' Date: Fri, 2 Oct 2020 03:13:09 +0200 Subject: [PATCH 3/6] Reduce amount of variables --- youtube_dl/extractor/boundhub.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/boundhub.py b/youtube_dl/extractor/boundhub.py index b489bae0d..b675d1051 100644 --- a/youtube_dl/extractor/boundhub.py +++ b/youtube_dl/extractor/boundhub.py @@ -28,48 +28,36 @@ class BoundHubIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, self._match_id(url)) # Parse duration duration_text = self._search_regex(r'\s*Duration:\s*([\w ]*)', webpage, 'duration_text', fatal=False) minutes = self._html_search_regex(r'(\d*)min', duration_text, 'minutes', fatal=False) seconds = self._html_search_regex(r'(\d*)sec', duration_text, 'seconds', fatal=False) - duration = (int(minutes) * 60) + int(seconds) - # Parse views - views_text = self._search_regex(r'\s*Views:\s*([\w ]*)', webpage, 'views_text', fatal=False) - views = int_or_none(views_text.replace(' ', '')) - - # Get uploader url and id + # Get uploader url uploader_url = self._search_regex(r'\s*([\s\S]+?)', webpage, 'html_screenshots', fatal=False) - regex_screenshots = r'([\s\S]+?)', webpage, 'html_screenshots', fatal=False)): img = dict() img['url'] = match.rstrip('/') img['id'] = int_or_none(os.path.splitext(os.path.basename(img['url']))[0]) thumbnails.append(img) return { - 'id': video_id, + 'id': self._match_id(url), 'title': self._search_regex(r'\s*

(.*)

', webpage, 'title', default=None) or self._og_search_title(webpage), 'url': self._search_regex(r'video_url: [\"\']([^\"\']*)[\"\']', webpage, 'url'), 'description': self._search_regex(r'\s*Description:\s*(.*)<\/em>', webpage, 'description', fatal=False), 'display_id': self._html_search_regex(r'https?://(?:www\.)?boundhub\.com/videos/[0-9]+/([\w-]*)', url, 'display_id', fatal=False), - 'duration': duration, + 'duration': (int(minutes) * 60) + int(seconds), 'ext': self._html_search_regex(r'postfix:\s*[\"\']\.([^\"\']*)[\"\']', webpage, 'ext', fatal=False), 'thumbnail': self._html_search_regex(r'preview_url:\s*[\"\']([^\"\']*)[\"\']', webpage, 'thumbnail', fatal=False), 'thumbnails': thumbnails, 'uploader': self._search_regex(r'\s*\s*(.*)\s*
', webpage, 'uploader', fatal=False), - 'uploader_id': uploader_id, + 'uploader_id': int_or_none(self._html_search_regex(r'https?://(?:www\.)?boundhub\.com/members/(\d+)', uploader_url, 'uploader_id', fatal=False)), 'uploader_url': uploader_url, - 'views': views, + 'views': int_or_none(self._search_regex(r'\s*Views:\s*([\w ]*)', webpage, 'views_text', fatal=False).replace(' ', '')), } From 6a6a7307cf858513bcafde9cc5d050fdb45603d8 Mon Sep 17 00:00:00 2001 From: aapjez Date: Fri, 2 Oct 2020 03:16:54 +0200 Subject: [PATCH 4/6] Remove additional thumbnails --- youtube_dl/extractor/boundhub.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/extractor/boundhub.py b/youtube_dl/extractor/boundhub.py index b675d1051..3fa791e47 100644 --- a/youtube_dl/extractor/boundhub.py +++ b/youtube_dl/extractor/boundhub.py @@ -38,14 +38,6 @@ class BoundHubIE(InfoExtractor): # Get uploader url uploader_url = self._search_regex(r'\s*([\s\S]+?)', webpage, 'html_screenshots', fatal=False)): - img = dict() - img['url'] = match.rstrip('/') - img['id'] = int_or_none(os.path.splitext(os.path.basename(img['url']))[0]) - thumbnails.append(img) - return { 'id': self._match_id(url), 'title': self._search_regex(r'\s*

(.*)

', webpage, 'title', default=None) or self._og_search_title(webpage), @@ -55,7 +47,6 @@ class BoundHubIE(InfoExtractor): 'duration': (int(minutes) * 60) + int(seconds), 'ext': self._html_search_regex(r'postfix:\s*[\"\']\.([^\"\']*)[\"\']', webpage, 'ext', fatal=False), 'thumbnail': self._html_search_regex(r'preview_url:\s*[\"\']([^\"\']*)[\"\']', webpage, 'thumbnail', fatal=False), - 'thumbnails': thumbnails, 'uploader': self._search_regex(r'\s*\s*(.*)\s*
', webpage, 'uploader', fatal=False), 'uploader_id': int_or_none(self._html_search_regex(r'https?://(?:www\.)?boundhub\.com/members/(\d+)', uploader_url, 'uploader_id', fatal=False)), 'uploader_url': uploader_url, From 3e60887a2abfd2ec5d812b9f6504e8f68b0519ad Mon Sep 17 00:00:00 2001 From: aapjez Date: Fri, 2 Oct 2020 03:36:36 +0200 Subject: [PATCH 5/6] Clean up code --- youtube_dl/extractor/boundhub.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/boundhub.py b/youtube_dl/extractor/boundhub.py index 3fa791e47..7e65ca73a 100644 --- a/youtube_dl/extractor/boundhub.py +++ b/youtube_dl/extractor/boundhub.py @@ -23,7 +23,7 @@ class BoundHubIE(InfoExtractor): 'thumbnail': 'https://cnt.boundhub.com/contents/videos_screenshots/205000/205969/preview.mp4.jpg', 'uploader': 'Tamina', 'uploader_id': 39278, - 'uploader_url': 'https://www.boundhub.com/members/39278/', + 'uploader_url': 'https://www.boundhub.com/members/39278/' } } @@ -44,11 +44,11 @@ class BoundHubIE(InfoExtractor): 'url': self._search_regex(r'video_url: [\"\']([^\"\']*)[\"\']', webpage, 'url'), 'description': self._search_regex(r'\s*Description:\s*(.*)<\/em>', webpage, 'description', fatal=False), 'display_id': self._html_search_regex(r'https?://(?:www\.)?boundhub\.com/videos/[0-9]+/([\w-]*)', url, 'display_id', fatal=False), - 'duration': (int(minutes) * 60) + int(seconds), + 'duration': int_or_none((int(minutes) * 60) + int(seconds)), 'ext': self._html_search_regex(r'postfix:\s*[\"\']\.([^\"\']*)[\"\']', webpage, 'ext', fatal=False), 'thumbnail': self._html_search_regex(r'preview_url:\s*[\"\']([^\"\']*)[\"\']', webpage, 'thumbnail', fatal=False), 'uploader': self._search_regex(r'\s*\s*(.*)\s*', webpage, 'uploader', fatal=False), 'uploader_id': int_or_none(self._html_search_regex(r'https?://(?:www\.)?boundhub\.com/members/(\d+)', uploader_url, 'uploader_id', fatal=False)), 'uploader_url': uploader_url, - 'views': int_or_none(self._search_regex(r'\s*Views:\s*([\w ]*)', webpage, 'views_text', fatal=False).replace(' ', '')), + 'views': int_or_none(self._search_regex(r'\s*Views:\s*([\w ]*)', webpage, 'views_text', fatal=False).replace(' ', '')) } From 3d0e4c1714e310d59549a7f0d34f13f539e6782a Mon Sep 17 00:00:00 2001 From: aapjez Date: Fri, 2 Oct 2020 03:39:40 +0200 Subject: [PATCH 6/6] Remove unused imports --- youtube_dl/extractor/boundhub.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/boundhub.py b/youtube_dl/extractor/boundhub.py index 7e65ca73a..7de8a6512 100644 --- a/youtube_dl/extractor/boundhub.py +++ b/youtube_dl/extractor/boundhub.py @@ -1,9 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import os -import re - from .common import InfoExtractor from ..utils import int_or_none