# coding: utf-8 from __future__ import unicode_literals import os import re from .common import InfoExtractor from ..utils import int_or_none class BoundHubIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?boundhub\.com/videos/(?P[0-9]+)' _TEST = { 'url': 'https://www.boundhub.com/videos/205969/tamina-in-species-appropriate-cage-system-housing/', 'md5': '6381e2491e6a42cc8e95c529b6da50a8', 'info_dict': { 'id': '205969', 'title': 'Tamina in species-appropriate cage system housing', 'description': 'Tamina in Straitjacket gagged an locked into a small cage for the afternoon.', 'display_id': 'tamina-in-species-appropriate-cage-system-housing', 'duration': 314, 'ext': 'mp4', 'thumbnail': 'https://cnt.boundhub.com/contents/videos_screenshots/205000/205969/preview.mp4.jpg', 'uploader': 'Tamina', 'uploader_id': 39278, 'uploader_url': 'https://www.boundhub.com/members/39278/', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) # Parse duration duration_text = self._search_regex(r'\s*Duration:\s*([\w ]*)', webpage, 'duration_text', fatal=False) minutes = self._html_search_regex(r'(\d*)min', duration_text, 'minutes', fatal=False) seconds = self._html_search_regex(r'(\d*)sec', duration_text, 'seconds', fatal=False) duration = (int(minutes) * 60) + int(seconds) # Parse views views_text = self._search_regex(r'\s*Views:\s*([\w ]*)', webpage, 'views_text', fatal=False) views = int_or_none(views_text.replace(' ', '')) # Get uploader url and id uploader_url = self._search_regex(r'\s*([\s\S]+?)', webpage, 'html_screenshots', fatal=False) regex_screenshots = r'' thumbnails = list() for match in re.findall(regex_screenshots, html_screenshots): img = dict() img['url'] = match[0] img['id'] = int_or_none(os.path.splitext(os.path.basename(img['url']))[0]) img['width'] = int_or_none(match[1]) img['height'] = int_or_none(match[2]) thumbnails.append(img) return { 'id': video_id, 'title': self._search_regex(r'\s*

(.*)

', webpage, 'title', default=None) or self._og_search_title(webpage), 'url': self._search_regex(r'video_url: [\"\']([^\"\']*)[\"\']', webpage, 'url'), 'description': self._search_regex(r'\s*Description:\s*(.*)<\/em>', webpage, 'description', fatal=False), 'display_id': self._html_search_regex(r'https?://(?:www\.)?boundhub\.com/videos/[0-9]+/([\w-]*)', url, 'display_id', fatal=False), 'duration': duration, 'ext': self._html_search_regex(r'postfix:\s*[\"\']\.([^\"\']*)[\"\']', webpage, 'ext', fatal=False), 'thumbnail': self._html_search_regex(r'preview_url:\s*[\"\']([^\"\']*)[\"\']', webpage, 'thumbnail', fatal=False), 'thumbnails': thumbnails, 'uploader': self._search_regex(r'\s*\s*(.*)\s*
', webpage, 'uploader', fatal=False), 'uploader_id': uploader_id, 'uploader_url': uploader_url, 'views': views, }