[sexdotcom] Add new extractor

Three different cases are handled: 1. Most video pages use the Video.js framework; the arguments of the JavaScript `player.updateSrc()` call are extracted to get the video details (one or two different formats). 2. Some video pages include an iframe which embeds a video from another site, such as YouTube, XHamster, XVideos and possibly more. 3. Pages whose content is an `<img>` tag (gifs or plain simple pictures) are also supported. This does not yet extract any metadata besides the essentials.
2025-02-18 18:17:55 +01:00 · 2019-10-22 03:32:50 +03:00 · 2019-10-22 03:32:50 +03:00 · 56493ba5b1
commit 56493ba5b1
parent cf80ff186e
2 changed files with 167 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -973,6 +973,7 @@ from .senateisvp import SenateISVPIE
 from .sendtonews import SendtoNewsIE
 from .servus import ServusIE
 from .sevenplus import SevenPlusIE
+from .sexdotcom import SexDotComIE
 from .sexu import SexuIE
 from .seznamzpravy import (
    SeznamZpravyIE,
--- a/youtube_dl/extractor/sexdotcom.py
+++ b/youtube_dl/extractor/sexdotcom.py
@ -0,0 +1,166 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, js_to_json, urljoin
+
+
+class SexDotComIE(InfoExtractor):
+    """Extractor for sex.com pin pages.
+
+    Three page variants are handled, tried in this order:
+
+    1. A Video.js player whose sources are passed to an ``updateSrc()``
+       call in the page's JavaScript.
+    2. An ``<iframe>`` inside the ``image_frame`` container that embeds
+       content hosted on another site; extraction is delegated.
+    3. An ``<img>`` inside the ``image_frame`` container (gif or picture).
+    """
+
+    IE_DESC = 'Sex.com'
+    _VALID_URL = r'https?://(?:www\.)?sex\.com/pin/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # Direct video, two formats
+        'url': 'https://www.sex.com/pin/55064004-jessica-nigri-cosplay/',
+        'md5': 'd1c14632c1c453ee680c94533bc01321',
+        'info_dict': {
+            'id': '55064004',
+            'ext': 'mp4',
+            'title': 'Jessica Nigri Cosplay',
+            'formats': [
+                {'ext': 'mp4', 'format_id': 'SD', 'height': 360},
+                {'ext': 'mp4', 'format_id': 'HD', 'height': 720},
+            ],
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # Embedded youtube.com
+        'url': 'https://www.sex.com/pin/54022740-bella-hadid-cannes-2016-red-dress-hd/',
+        'info_dict': {
+            'id': 'bZgp8wOEaKY',
+            'ext': 'mp4',
+            'upload_date': '20160526',
+            'title': 'Bella Hadid robe tapis rouge Cannes 2016',
+            'uploader': 'Véronique ESPINASSE',
+            'description': 'Innocente beauté...',
+            'uploader_id': 'UCIgl6XJAreFl0wwaN1wm3Pg',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # Embedded xhamster.com
+        'url': 'https://www.sex.com/pin/56529538-flawless-tease/',
+        'info_dict': {
+            'id': '8461824',
+            'ext': 'mp4',
+            'title': 'shorts tease',
+            'age_limit': 18,
+            'upload_date': '20171031',
+            'uploader': 'anatolio',
+            'timestamp': 1509470696,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # gif
+        'url': 'https://www.sex.com/pin/55000357-cat-4/',
+        'md5': 'bba5e04e0555e928852b3ab05c93f1a9',
+        'info_dict': {
+            'id': '55000357',
+            'ext': 'gif',
+            'title': 'cat 4',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # image
+        'url': 'https://www.sex.com/pin/35756851/',
+        'md5': 'fc174f007933b0cae74682ca3532e423',
+        'info_dict': {
+            'id': '35756851',
+            'ext': 'jpg',
+            'title': 'Pin #35756851',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Extract the content of a single pin page.
+
+        Returns an info dict for direct videos and images, or the result of
+        ``url_result()`` when the page embeds content from another site.
+        Raises ExtractorError when the content container holds neither an
+        iframe nor an image.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'(?s)<h1[^>]*>.*?<span[^>]+(["\']?)name\1[^>]*>(?P<title>.+?)</span>.*</h1>',
+            webpage, 'title', group='title', fatal=False)
+        if not title:
+            # We prefer the above title; 'og:title' also contains uploader name
+            title = self._og_search_title(webpage)
+
+        # Direct video
+        formats = self._try_videojs(webpage, url, video_id)
+        if formats:
+            return {
+                'id': video_id,
+                'title': title,
+                'formats': formats,
+                'age_limit': 18,
+            }
+
+        # It's fairly difficult to distinguish between the various advert
+        # <iframe>s and the one embedding real content.
+        # So we find the "image_frame" div first, content is always its child.
+        container = self._search_regex(
+            r'(?s)<div[^>]+class=(["\']?)image_frame\1[^>]*>(?P<content>.+?)</div>',
+            webpage, 'content container', group='content')
+
+        # Try embedded content iframe; YouTube, XHamster, XVideos and more.
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.*?)\1', container)
+        if mobj:
+            # The pattern above also accepts protocol-relative sources
+            # ("//host/path"); give them an explicit scheme before delegating
+            # so the target extractor receives an absolute URL.
+            embed_url = self._proto_relative_url(mobj.group('url'))
+            return self.url_result(
+                embed_url, video_id=video_id, video_title=title)
+
+        # Try image or gif
+        mobj = re.search(r'<img[^>]+src=(["\'])(?P<url>.+?)\1', container)
+        if mobj:
+            img_url = urljoin(url, mobj.group('url'))
+            return {
+                'id': video_id,
+                'title': title,
+                'url': img_url,
+                'http_headers': {
+                    'Referer': url,
+                },
+                'age_limit': 18,
+            }
+
+        raise ExtractorError('%s: Cannot identify content in container' % video_id)
+
+    def _try_videojs(self, webpage, url, video_id):
+        """Parse arguments of videojs updateSrc() method call.
+
+        Returns a sorted list of format dicts built from the sources passed
+        to ``updateSrc()``, or None when the page has no such call or none of
+        its sources is usable, so that the caller can fall back to the
+        iframe/image handling.
+        """
+
+        mobj = re.search(r'[a-z]\.updateSrc\s*\((?P<sources>[^)]+?)\)', webpage)
+        if not mobj:
+            return None
+
+        # Parse non-fatally: a malformed argument must not abort extraction,
+        # the caller still has other strategies to try.
+        sources = self._parse_json(
+            mobj.group('sources'), video_id=video_id,
+            transform_source=js_to_json, fatal=False)
+        if isinstance(sources, dict):
+            # A single source object instead of an array of them.
+            sources = [sources]
+        if not isinstance(sources, list):
+            return None
+
+        formats = []
+        for source in sources:
+            # Guard the type first: 'in' on a string or list would be a
+            # substring/element test rather than a key lookup.
+            if (not isinstance(source, dict)
+                    or 'src' not in source or 'type' not in source):
+                self.report_warning(
+                    '%s: Unable to handle source: %r' % (video_id, source))
+                continue
+
+            video_url = urljoin(url, source['src'])
+            if not video_url:
+                self.report_warning(
+                    '%s: Unable to handle source: %r' % (video_id, source))
+                continue
+
+            # 'res' may arrive as a number or a numeric string; normalise to
+            # an integer and drop anything that is not a number.
+            try:
+                height = int(source.get('res'))
+            except (TypeError, ValueError):
+                height = None
+
+            _type = source['type']
+            formats.append({
+                'url': video_url,
+                'ext': _type[6:] if _type.startswith('video/') else None,
+                'format_id': source.get('label'),
+                'height': height,
+            })
+
+        if not formats:
+            # Do not hand an empty list to _sort_formats(): in youtube-dl it
+            # raises in that case, which would prevent the caller's fallback.
+            return None
+
+        self._sort_formats(formats)
+        return formats