mirror of
https://codeberg.org/polarisfm/youtube-dl
synced 2024-11-22 16:44:32 +01:00
[sexdotcom] Add new extractor
Three different cases are handled: 1. Most video pages use the Video.js framework; the arguments of the JavaScript `player.updateSrc()` call are extracted to get the video details (one or two different formats). 2. Some video pages include an iframe which embeds a video from another site, such as YouTube, XHamster, XVideos and possibly more. 3. Pages that show a GIF or a plain picture in an `<img>` tag are also supported. This does not yet extract any metadata besides the essentials.
This commit is contained in:
parent
cf80ff186e
commit
56493ba5b1
@ -973,6 +973,7 @@ from .senateisvp import SenateISVPIE
|
|||||||
from .sendtonews import SendtoNewsIE
|
from .sendtonews import SendtoNewsIE
|
||||||
from .servus import ServusIE
|
from .servus import ServusIE
|
||||||
from .sevenplus import SevenPlusIE
|
from .sevenplus import SevenPlusIE
|
||||||
|
from .sexdotcom import SexDotComIE
|
||||||
from .sexu import SexuIE
|
from .sexu import SexuIE
|
||||||
from .seznamzpravy import (
|
from .seznamzpravy import (
|
||||||
SeznamZpravyIE,
|
SeznamZpravyIE,
|
||||||
|
166
youtube_dl/extractor/sexdotcom.py
Normal file
166
youtube_dl/extractor/sexdotcom.py
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
from ..utils import ExtractorError, int_or_none, js_to_json, urljoin
|
||||||
|
|
||||||
|
|
||||||
|
class SexDotComIE(InfoExtractor):
    """Extractor for sex.com pin pages.

    A pin page can hold one of three kinds of content, tried in this order:

    1. a directly hosted video played through Video.js, whose sources are
       passed to a JavaScript ``updateSrc()`` call;
    2. an ``<iframe>`` embedding a video hosted on another site, which is
       delegated to whichever extractor matches the embedded URL;
    3. an ``<img>`` tag holding a GIF or a still picture.
    """

    IE_DESC = 'Sex.com'
    _VALID_URL = r'https?://(?:www\.)?sex\.com/pin/(?P<id>[0-9]+)'
    _TESTS = [{
        # Direct video, two formats
        'url': 'https://www.sex.com/pin/55064004-jessica-nigri-cosplay/',
        'md5': 'd1c14632c1c453ee680c94533bc01321',
        'info_dict': {
            'id': '55064004',
            'ext': 'mp4',
            'title': 'Jessica Nigri Cosplay',
            'formats': [
                {'ext': 'mp4', 'format_id': 'SD', 'height': 360},
                {'ext': 'mp4', 'format_id': 'HD', 'height': 720},
            ],
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # Embedded youtube.com
        'url': 'https://www.sex.com/pin/54022740-bella-hadid-cannes-2016-red-dress-hd/',
        'info_dict': {
            'id': 'bZgp8wOEaKY',
            'ext': 'mp4',
            'upload_date': '20160526',
            'title': 'Bella Hadid robe tapis rouge Cannes 2016',
            'uploader': 'Véronique ESPINASSE',
            'description': 'Innocente beauté...',
            'uploader_id': 'UCIgl6XJAreFl0wwaN1wm3Pg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # Embedded xhamster.com
        'url': 'https://www.sex.com/pin/56529538-flawless-tease/',
        'info_dict': {
            'id': '8461824',
            'ext': 'mp4',
            'title': 'shorts tease',
            'age_limit': 18,
            'upload_date': '20171031',
            'uploader': 'anatolio',
            'timestamp': 1509470696,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # gif
        'url': 'https://www.sex.com/pin/55000357-cat-4/',
        'md5': 'bba5e04e0555e928852b3ab05c93f1a9',
        'info_dict': {
            'id': '55000357',
            'ext': 'gif',
            'title': 'cat 4',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # image
        'url': 'https://www.sex.com/pin/35756851/',
        'md5': 'fc174f007933b0cae74682ca3532e423',
        'info_dict': {
            'id': '35756851',
            'ext': 'jpg',
            'title': 'Pin #35756851',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract the content of a single pin page.

        Returns an info dict for directly hosted videos and for images, or a
        URL result pointing at the embedded page when the pin wraps a video
        from another site. Raises ExtractorError when the content container
        holds neither an iframe nor an image.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'(?s)<h1[^>]*>.*?<span[^>]+(["\']?)name\1[^>]*>(?P<title>.+?)</span>.*</h1>',
            webpage, 'title', group='title', fatal=False)
        if not title:
            # We prefer the above title; 'og:title' also contains uploader name
            title = self._og_search_title(webpage)

        # Direct video
        formats = self._try_videojs(webpage, url, video_id)
        if formats:
            return {
                'id': video_id,
                'title': title,
                'formats': formats,
                'age_limit': 18,
            }

        # It's fairly difficult to distinguish between the various advert
        # <iframe>s and the one embedding real content.
        # So we find the "image_frame" div first, content is always its child.
        container = self._search_regex(
            r'(?s)<div[^>]+class=(["\']?)image_frame\1[^>]*>(?P<content>.+?)</div>',
            webpage, 'content container', group='content')

        # Try embedded content iframe; YouTube, XHamster, XVideos and more.
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.*?)\1', container)
        if mobj:
            return self.url_result(
                mobj.group('url'), video_id=video_id, video_title=title)

        # Try image or gif
        mobj = re.search(r'<img[^>]+src=(["\'])(?P<url>.+?)\1', container)
        if mobj:
            img_url = urljoin(url, mobj.group('url'))
            return {
                'id': video_id,
                'title': title,
                'url': img_url,
                'http_headers': {
                    'Referer': url,
                },
                'age_limit': 18,
            }

        raise ExtractorError('%s: Cannot identify content in container' % video_id)

    def _try_videojs(self, webpage, url, video_id):
        """Build formats from the arguments of a Video.js updateSrc() call.

        webpage is the pin page HTML, url the page URL used to resolve
        relative source URLs, and video_id the pin id used in messages.

        Returns a sorted list of format dicts, or None when the page has no
        updateSrc() call or none of its sources is usable. Returning None
        instead of raising lets the caller fall back to the iframe and image
        handling.
        """
        mobj = re.search(
            r'[a-z]\.updateSrc\s*\((?P<sources>[^)]+?)\)', webpage)
        if not mobj:
            return None

        # Not fatal: an argument that is not a parsable literal (for instance
        # a variable name) must not prevent the other content types from
        # being tried.
        sources = self._parse_json(
            mobj.group('sources'), video_id=video_id,
            transform_source=js_to_json, fatal=False)
        if isinstance(sources, dict):
            # A single source object instead of an array of them
            sources = [sources]
        if not isinstance(sources, list):
            return None

        formats = []
        for source in sources:
            if isinstance(source, dict):
                # urljoin yields None for a missing or non-string 'src'
                video_url = urljoin(url, source.get('src'))
                mime_type = source.get('type')
            else:
                video_url = mime_type = None
            if not video_url or not mime_type:
                self.report_warning(
                    '%s: Unable to handle source: %r' % (video_id, source))
                continue

            formats.append({
                'url': video_url,
                'ext': mime_type[6:] if mime_type.startswith('video/') else None,
                'format_id': source.get('label'),
                # 'res' may be given as a string; heights must be integers
                # to be compared correctly when sorting
                'height': int_or_none(source.get('res')),
            })

        if not formats:
            # _sort_formats raises on an empty list, which would stop the
            # caller from trying the remaining content types
            return None

        self._sort_formats(formats)
        return formats
|
Loading…
Reference in New Issue
Block a user