1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-22 16:44:32 +01:00

[sexdotcom] Add new extractor

Three different cases are handled:
1. Most video pages use the Video.js framework. I extract the arguments
   of the JavaScript `player.updateSrc()` call to get the video details
   (one or two different formats).
2. Some video pages include an iframe which embeds a video from another
   site, such as YouTube, XHamster, XVideos and possibly more.
3. Pages with a plain `<img>` tag (GIFs or still pictures) are also supported.

This does not yet extract any metadata besides the essentials.
This commit is contained in:
Aksel Eromeeter 2019-10-22 03:32:50 +03:00
parent cf80ff186e
commit 56493ba5b1
2 changed files with 167 additions and 0 deletions

View File

@ -973,6 +973,7 @@ from .senateisvp import SenateISVPIE
from .sendtonews import SendtoNewsIE from .sendtonews import SendtoNewsIE
from .servus import ServusIE from .servus import ServusIE
from .sevenplus import SevenPlusIE from .sevenplus import SevenPlusIE
from .sexdotcom import SexDotComIE
from .sexu import SexuIE from .sexu import SexuIE
from .seznamzpravy import ( from .seznamzpravy import (
SeznamZpravyIE, SeznamZpravyIE,

View File

@ -0,0 +1,166 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError, js_to_json, urljoin
class SexDotComIE(InfoExtractor):
    """Extractor for sex.com pin pages.

    Three kinds of pin are handled, tried in this order:

    1. Direct videos whose sources are passed to a Video.js
       ``updateSrc()`` call in the page's JavaScript.
    2. Content embedded from another site through an ``<iframe>``; the
       iframe URL is handed over to whichever extractor supports it.
    3. Pictures and gifs referenced by a plain ``<img>`` tag.
    """

    IE_DESC = 'Sex.com'
    _VALID_URL = r'https?://(?:www\.)?sex\.com/pin/(?P<id>[0-9]+)'
    _TESTS = [{
        # Direct video, two formats
        'url': 'https://www.sex.com/pin/55064004-jessica-nigri-cosplay/',
        'md5': 'd1c14632c1c453ee680c94533bc01321',
        'info_dict': {
            'id': '55064004',
            'ext': 'mp4',
            'title': 'Jessica Nigri Cosplay',
            'formats': [
                {'ext': 'mp4', 'format_id': 'SD', 'height': 360},
                {'ext': 'mp4', 'format_id': 'HD', 'height': 720},
            ],
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # Embedded youtube.com
        'url': 'https://www.sex.com/pin/54022740-bella-hadid-cannes-2016-red-dress-hd/',
        'info_dict': {
            'id': 'bZgp8wOEaKY',
            'ext': 'mp4',
            'upload_date': '20160526',
            'title': 'Bella Hadid robe tapis rouge Cannes 2016',
            'uploader': 'Véronique ESPINASSE',
            'description': 'Innocente beauté...',
            'uploader_id': 'UCIgl6XJAreFl0wwaN1wm3Pg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # Embedded xhamster.com
        'url': 'https://www.sex.com/pin/56529538-flawless-tease/',
        'info_dict': {
            'id': '8461824',
            'ext': 'mp4',
            'title': 'shorts tease',
            'age_limit': 18,
            'upload_date': '20171031',
            'uploader': 'anatolio',
            'timestamp': 1509470696,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # gif
        'url': 'https://www.sex.com/pin/55000357-cat-4/',
        'md5': 'bba5e04e0555e928852b3ab05c93f1a9',
        'info_dict': {
            'id': '55000357',
            'ext': 'gif',
            'title': 'cat 4',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # image
        'url': 'https://www.sex.com/pin/35756851/',
        'md5': 'fc174f007933b0cae74682ca3532e423',
        'info_dict': {
            'id': '35756851',
            'ext': 'jpg',
            'title': 'Pin #35756851',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract the media behind a sex.com pin URL.

        Returns an info dict for direct videos and for images/gifs, or a
        URL result delegating to another extractor for iframe embeds.
        Raises ExtractorError when the content container holds neither an
        iframe nor an image.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'(?s)<h1[^>]*>.*?<span[^>]+(["\']?)name\1[^>]*>(?P<title>.+?)</span>.*</h1>',
            webpage, 'title', group='title', fatal=False)
        if not title:
            # We prefer the above title; 'og:title' also contains uploader name
            title = self._og_search_title(webpage)

        # Direct video
        formats = self._try_videojs(webpage, url, video_id)
        if formats:
            return {
                'id': video_id,
                'title': title,
                'formats': formats,
                'age_limit': 18,
            }

        # It's fairly difficult to distinguish between the various advert
        # <iframe>s and the one embedding real content.
        # So we find the "image_frame" div first, content is always its child.
        container = self._search_regex(
            r'(?s)<div[^>]+class=(["\']?)image_frame\1[^>]*>(?P<content>.+?)</div>',
            webpage, 'content container', group='content')

        # Try embedded content iframe; YouTube, XHamster, XVideos and more.
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.*?)\1', container)
        if mobj:
            # The pattern above also accepts protocol-relative ("//host/...")
            # URLs; give them an explicit scheme before delegating.
            return self.url_result(
                self._proto_relative_url(mobj.group('url')),
                video_id=video_id, video_title=title)

        # Try image or gif
        mobj = re.search(r'<img[^>]+src=(["\'])(?P<url>.+?)\1', container)
        if mobj:
            img_url = urljoin(url, mobj.group('url'))
            return {
                'id': video_id,
                'title': title,
                'url': img_url,
                'http_headers': {
                    'Referer': url,
                },
                'age_limit': 18,
            }

        raise ExtractorError('%s: Cannot identify content in container' % video_id)

    def _try_videojs(self, webpage, url, video_id):
        """Parse arguments of videojs updateSrc() method call.

        Returns the list of format dicts (after _sort_formats), or None
        when the page has no updateSrc() call, its argument cannot be
        parsed, or none of its sources is usable.  Returning None instead
        of raising lets the caller move on to its iframe/image fallbacks.
        """
        mobj = re.search(r'[a-z]\.updateSrc\s*\((?P<sources>[^)]+?)\)', webpage)
        if not mobj:
            return None

        # Non-fatal: a parse failure only produces a warning here.
        sources = self._parse_json(
            mobj.group('sources'), video_id=video_id,
            transform_source=js_to_json, fatal=False)
        if isinstance(sources, dict):
            # Tolerate a single source object instead of an array.
            sources = [sources]
        if not isinstance(sources, list):
            return None

        formats = []
        for source in sources:
            if (not isinstance(source, dict)
                    or 'src' not in source or 'type' not in source):
                self.report_warning(
                    '%s: Unable to handle source: %r' % (video_id, source))
                continue
            _type = source['type']
            formats.append({
                'url': urljoin(url, source['src']),
                # 'video/mp4' -> 'mp4'; other MIME types give no extension
                'ext': _type[6:] if _type.startswith('video/') else None,
                'format_id': source.get('label'),
                'height': source.get('res'),
            })

        if not formats:
            # _sort_formats() would raise on an empty list.
            return None
        self._sort_formats(formats)
        return formats