youtube-dl/youtube_dl/extractor/pornhub.py

# coding: utf-8
from __future__ import unicode_literals

import functools
import itertools
import operator
# import os
import re

from .common import InfoExtractor
from ..compat import (
    compat_HTTPError,
    # compat_urllib_parse_unquote,
    # compat_urllib_parse_unquote_plus,
    # compat_urllib_parse_urlparse,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    js_to_json,
    orderedSet,
    # sanitized_Request,
    remove_quotes,
    str_to_int,
)
# from ..aes import (
#     aes_decrypt_text
# )


class PornHubIE(InfoExtractor):
    # Extractor for a single video page. _VALID_URL accepts pornhub.com
    # view_video.php / video/show / embed URLs and thumbzilla.com video URLs;
    # the captured id is always fetched from www.pornhub.com (see dl_webpage).
    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    '''
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '1e19b41231a02eba417839222ac9d58e',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'uploader': 'cj397186295',
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }, {
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the pornhub.com/embed/ URLs found in <iframe src=...> tags of webpage."""
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
            webpage)

    def _extract_count(self, pattern, webpage, name):
        """Search webpage for pattern and convert the match with str_to_int.

        The search is non-fatal; name is only used to label the field
        ('<name> count') in the search call.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _real_extract(self, url):
        """Build the info dict for the video identified by url.

        The page is downloaded twice: once with the ``platform=pc`` cookie
        (error messages and metadata) and once with ``platform=tv`` (the
        obfuscated media URL and the preferred title source).

        Raises ExtractorError (expected=True) when the pc page contains a
        "removed" / "userMessageSection" block.
        """
        video_id = self._match_id(url)

        def dl_webpage(platform):
            # Same canonical URL for both variants; only the cookie differs.
            return self._download_webpage(
                'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
                video_id, headers={
                    'Cookie': 'age_verified=1; platform=%s' % platform,
                })

        webpage = dl_webpage('pc')

        error_msg = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
            webpage, 'error message', default=None, group='error')
        if error_msg:
            # Collapse whitespace runs so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        tv_webpage = dl_webpage('tv')

        # The media URL is assembled in JS from several string variables:
        #   var a = "ht" + "tp"; var mediastring = a + /* junk */ b; ...
        # Take the statements up to </script> and evaluate them in order.
        assignments = self._search_regex(
            r'(var.+?mediastring.+?)</script>', tv_webpage,
            'encoded url').split(';')

        js_vars = {}

        def parse_js_value(inp):
            # Drop /* ... */ comments, then resolve a "+"-concatenation of
            # quoted literals and previously assigned variable names.
            inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
            if '+' in inp:
                inps = inp.split('+')
                return functools.reduce(
                    operator.concat, map(parse_js_value, inps))
            inp = inp.strip()
            if inp in js_vars:
                return js_vars[inp]
            return remove_quotes(inp)

        for assn in assignments:
            assn = assn.strip()
            if not assn:
                continue
            assn = re.sub(r'var\s+', '', assn)
            vname, value = assn.split('=', 1)
            # Store under the stripped name: parse_js_value looks names up
            # stripped, so "var a = ..." must be keyed as "a", not "a ".
            js_vars[vname.strip()] = parse_js_value(value)

        video_url = js_vars['mediastring']

        title = self._search_regex(
            r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on that anymore.
        title = title or self._html_search_meta(
            'twitter:title', webpage, default=None) or self._search_regex(
            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id)
        if flashvars:
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
        else:
            # Only these two fields come from flashvars. The title was
            # obtained from the page markup above and must be kept.
            thumbnail, duration = [None] * 2

        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        # Tags and categories are comma-separated strings inside a JS object
        # literal, hence js_to_json before parsing.
        page_params = self._parse_json(self._search_regex(
            r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
            webpage, 'page parameters', group='data', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        tags = categories = None
        if page_params:
            tags = page_params.get('tags', '').split(',')
            categories = page_params.get('categories', '').split(',')

        return {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            # 'formats': formats,
            'age_limit': 18,
            'tags': tags,
            'categories': categories,
        }


class PornHubPlaylistBaseIE(InfoExtractor):
    # Common base for extractors that turn a listing page into a playlist of
    # PornHubIE url results.

    def _extract_entries(self, webpage):
        """Return url_result entries for the view_video.php links in webpage.

        Every (link, title) pair found is passed through orderedSet before
        being converted, and each entry is delegated to PornHubIE.
        """
        link_title_pairs = re.findall(
            r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
            webpage)

        results = []
        for video_path, video_title in orderedSet(link_title_pairs):
            results.append(self.url_result(
                'http://www.pornhub.com/%s' % video_path,
                PornHubIE.ie_key(), video_title=video_title))
        return results

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result.

        Title and description are read from the page's ``playlistObject``
        JSON; the lookup of that object is fatal if it is missing.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # Only process container div with main playlist content skipping
        # drop-down menu that uses similar pattern for videos (see
        # https://github.com/rg3/youtube-dl/issues/11594).
        # Falls back to the whole page when no container div is found.
        container = self._search_regex(
            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
            'container', default=webpage)
        entries = self._extract_entries(container)

        playlist_json = self._search_regex(
            r'playlistObject\s*=\s*({.+?});', webpage, 'playlist')
        playlist = self._parse_json(playlist_json, playlist_id)

        playlist_title = playlist.get('title')
        playlist_description = playlist.get('description')
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)


class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    # Extractor for pornhub.com/playlist/<numeric id> pages. It defines no
    # methods of its own; extraction is inherited from PornHubPlaylistBaseIE.
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/4667351',
        'info_dict': {
            'id': '4667351',
            'title': 'Nataly Hot',
        },
        # The test expects at least this many entries in the playlist.
        'playlist_mincount': 2,
    }]


class PornHubUserVideosIE(PornHubPlaylistBaseIE):
    # Extractor for pornhub.com/users/<name>/videos listings, which are
    # fetched page by page via the "page" query parameter.
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'
    _TESTS = [{
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'info_dict': {
            'id': 'zoe_ph',
        },
        'playlist_mincount': 171,
    }, {
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect entries from every page of a user's video listing.

        Pages are requested starting at 1 until a request fails with
        HTTP 404 or a page yields no entries. Any other download error is
        propagated to the caller.
        """
        user_id = self._match_id(url)

        entries = []
        for page_num in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    url, user_id, 'Downloading page %d' % page_num,
                    query={'page': page_num})
            except ExtractorError as e:
                # A 404 is treated as the end of the listing.
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
                    break
                # Anything else is a real failure. Re-raise it, because
                # continuing would use an unbound or stale webpage.
                raise
            page_entries = self._extract_entries(webpage)
            if not page_entries:
                break
            entries.extend(page_entries)

        return self.playlist_result(entries, user_id)
[pornhub] Improve title extraction (Closes #9777) 2016-06-13 23:57:59 +02:00			`# coding: utf-8`
[pornhub] Use centralized sorting 2014-01-07 10:25:34 +01:00			`from __future__ import unicode_literals`

[pornhub] Decode obfuscated video URL (closes #12470) 2017-03-21 00:29:39 +01:00			`import functools`
[pornhub:uservideos] Add support for multipage videos (Closes #9006) 2016-03-27 20:50:46 +02:00			`import itertools`
[pornhub] Decode obfuscated video URL (closes #12470) 2017-03-21 00:29:39 +01:00			`import operator`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`# import os`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`import re`

			`from .common import InfoExtractor`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 12:24:42 +01:00			`from ..compat import (`
[pornhub:uservideos] Add support for multipage videos (Closes #9006) 2016-03-27 20:50:46 +02:00			`compat_HTTPError,`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`# compat_urllib_parse_unquote,`
			`# compat_urllib_parse_unquote_plus,`
			`# compat_urllib_parse_urlparse,`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 12:24:42 +01:00			`)`
			`from ..utils import (`
[pornhub] Detect private videos and emit an error message (Closes #4764) 2015-01-22 23:48:58 +01:00			`ExtractorError,`
[pornhub] Fix thumbnail and duration extraction (Closes #8604) 2016-02-19 16:42:46 +01:00			`int_or_none,`
[pornhub] Extract categories and tags (closes #10499) 2016-09-11 13:22:51 +02:00			`js_to_json,`
[pornhub:playlistbase] Use orderedSet 2016-03-27 20:21:08 +02:00			`orderedSet,`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`# sanitized_Request,`
[pornhub] Improve extraction and style (closes #12515) 2017-03-21 19:59:27 +01:00			`remove_quotes,`
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00			`str_to_int,`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`)`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`# from ..aes import (`
			`# aes_decrypt_text`
			`# )`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00
[pornhub] Use centralized sorting 2014-01-07 10:25:34 +01:00
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`class PornHubIE(InfoExtractor):`
[pornhub] Add support for thumbzilla (Closes #8696) 2016-07-01 21:11:07 +02:00			`IE_DESC = 'PornHub and Thumbzilla'`
			`_VALID_URL = r'''(?x)`
			`https?://`
			`(?:`
[pornhub] Extend _VALID_URL (closes #12996) 2017-05-05 21:46:37 +02:00			`(?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php\|video/show)\?viewkey=\|embed/)\|`
[pornhub] Add support for thumbzilla (Closes #8696) 2016-07-01 21:11:07 +02:00			`(?:www\.)?thumbzilla\.com/video/`
			`)`
[extractor/generic] Improve support for pornhub embeds (closes #11100) 2016-11-06 15:52:00 +01:00			`(?P<id>[\da-z]+)`
[pornhub] Add support for thumbzilla (Closes #8696) 2016-07-01 21:11:07 +02:00			`'''`
[pornhub] Extend _VALID_URL (Closes #6019) 2015-06-18 18:26:17 +02:00			`_TESTS = [{`
[pornhub] Use centralized sorting 2014-01-07 10:25:34 +01:00			`'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',`
[pornhub] Fix thumbnail and duration extraction (Closes #8604) 2016-02-19 16:42:46 +01:00			`'md5': '1e19b41231a02eba417839222ac9d58e',`
[pornhub] Use centralized sorting 2014-01-07 10:25:34 +01:00			`'info_dict': {`
[pornhub] Modernize and fix test definition 2014-10-27 00:33:35 +01:00			`'id': '648719015',`
			`'ext': 'mp4',`
[refactor] Single quotes consistency 2016-02-14 10:37:17 +01:00			`'title': 'Seductive Indian beauty strips down and fingers her pink pussy',`
[pornhub] Fix thumbnail and duration extraction (Closes #8604) 2016-02-19 16:42:46 +01:00			`'uploader': 'Babes',`
			`'duration': 361,`
			`'view_count': int,`
			`'like_count': int,`
			`'dislike_count': int,`
			`'comment_count': int,`
			`'age_limit': 18,`
[pornhub] Extract categories and tags (closes #10499) 2016-09-11 13:22:51 +02:00			`'tags': list,`
			`'categories': list,`
[pornhub] Improve title extraction (Closes #9777) 2016-06-13 23:57:59 +02:00			`},`
			`}, {`
			`# non-ASCII title`
			`'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',`
			`'info_dict': {`
			`'id': '1331683002',`
			`'ext': 'mp4',`
			`'title': '重庆婷婷女王足交',`
			`'uploader': 'cj397186295',`
			`'duration': 1753,`
			`'view_count': int,`
			`'like_count': int,`
			`'dislike_count': int,`
			`'comment_count': int,`
			`'age_limit': 18,`
[pornhub] Extract categories and tags (closes #10499) 2016-09-11 13:22:51 +02:00			`'tags': list,`
			`'categories': list,`
[pornhub] Improve title extraction (Closes #9777) 2016-06-13 23:57:59 +02:00			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[pornhub] Extend _VALID_URL (Closes #6019) 2015-06-18 18:26:17 +02:00			`}, {`
			`'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',`
			`'only_matching': True,`
[pornhub] Relax _VALID_URL (Closes #6868) 2015-09-15 17:24:01 +02:00			`}, {`
[pornhub] Add more tests with removed videos 2016-06-30 22:18:27 +02:00			`# removed at the request of cam4.com`
[pornhub] Relax _VALID_URL (Closes #6868) 2015-09-15 17:24:01 +02:00			`'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',`
			`'only_matching': True,`
[pornhub] Add more tests with removed videos 2016-06-30 22:18:27 +02:00			`}, {`
			`# removed at the request of the copyright owner`
			`'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',`
			`'only_matching': True,`
			`}, {`
			`# removed by uploader`
			`'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',`
			`'only_matching': True,`
[pornhub] Detect private videos (Closes #9987) 2016-07-03 22:27:00 +02:00			`}, {`
			`# private video`
			`'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',`
			`'only_matching': True,`
[pornhub] Add support for thumbzilla (Closes #8696) 2016-07-01 21:11:07 +02:00			`}, {`
			`'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',`
			`'only_matching': True,`
[pornhub] Extend _VALID_URL (closes #12996) 2017-05-05 21:46:37 +02:00			`}, {`
			`'url': 'http://www.pornhub.com/video/show?viewkey=648719015',`
			`'only_matching': True,`
[pornhub] Extend _VALID_URL (Closes #6019) 2015-06-18 18:26:17 +02:00			`}]`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00
[extractor/generic] Improve support for pornhub embeds (closes #11100) 2016-11-06 15:52:00 +01:00			`@staticmethod`
			`def _extract_urls(webpage):`
			`return re.findall(`
			`r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',`
			`webpage)`
[extractor/generic] Add support for pornhub embeds 2015-06-12 23:36:16 +02:00
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00			`def _extract_count(self, pattern, webpage, name):`
[pornhub] Fix comment count extraction (Closes #5320) 2015-03-30 15:41:04 +02:00			`return str_to_int(self._search_regex(`
			`pattern, webpage, '%s count' % name, fatal=False))`
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`def _real_extract(self, url):`
[pornhub] Modernize and fix test definition 2014-10-27 00:33:35 +01:00			`video_id = self._match_id(url)`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`def dl_webpage(platform):`
			`return self._download_webpage(`
			`'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,`
			`video_id, headers={`
			`'Cookie': 'age_verified=1; platform=%s' % platform,`
			`})`

			`webpage = dl_webpage('pc')`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00
[pornhub] Detect private videos and emit an error message (Closes #4764) 2015-01-22 23:48:58 +01:00			`error_msg = self._html_search_regex(`
[pornhub] Make error regex less ambiguous (Closes #10138) 2016-07-22 16:24:09 +02:00			`r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)\b(?:removed\|userMessageSection)\b(?:(?!\1).)\1[^>]*>(?P<error>.+?)</div>',`
[pornhub] Relax removed message regex (Closes #9964) 2016-06-30 22:14:23 +02:00			`webpage, 'error message', default=None, group='error')`
[pornhub] Detect private videos and emit an error message (Closes #4764) 2015-01-22 23:48:58 +01:00			`if error_msg:`
			`error_msg = re.sub(r'\s+', ' ', error_msg)`
			`raise ExtractorError(`
			`'PornHub said: %s' % error_msg,`
			`expected=True, video_id=video_id)`

[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`tv_webpage = dl_webpage('tv')`

[pornhub] Improve extraction and style (closes #12515) 2017-03-21 19:59:27 +01:00			`assignments = self._search_regex(`
			`r'(var.+?mediastring.+?)</script>', tv_webpage,`
			`'encoded url').split(';')`

[pornhub] Decode obfuscated video URL (closes #12470) 2017-03-21 00:29:39 +01:00			`js_vars = {}`

			`def parse_js_value(inp):`
[pornhub] Improve extraction and style (closes #12515) 2017-03-21 19:59:27 +01:00			`inp = re.sub(r'/\(?:(?!\/).)?\/', '', inp)`
			`if '+' in inp:`
			`inps = inp.split('+')`
			`return functools.reduce(`
			`operator.concat, map(parse_js_value, inps))`
[pornhub] Decode obfuscated video URL (closes #12470) 2017-03-21 00:29:39 +01:00			`inp = inp.strip()`
			`if inp in js_vars:`
			`return js_vars[inp]`
[pornhub] Improve extraction and style (closes #12515) 2017-03-21 19:59:27 +01:00			`return remove_quotes(inp)`
[pornhub] Decode obfuscated video URL (closes #12470) 2017-03-21 00:29:39 +01:00
			`for assn in assignments:`
			`assn = assn.strip()`
[pornhub] Improve extraction and style (closes #12515) 2017-03-21 19:59:27 +01:00			`if not assn:`
[pornhub] Decode obfuscated video URL (closes #12470) 2017-03-21 00:29:39 +01:00			`continue`
[pornhub] Improve extraction and style (closes #12515) 2017-03-21 19:59:27 +01:00			`assn = re.sub(r'var\s+', '', assn)`
			`vname, value = assn.split('=', 1)`
[pornhub] Decode obfuscated video URL (closes #12470) 2017-03-21 00:29:39 +01:00			`js_vars[vname] = parse_js_value(value)`

[pornhub] Improve extraction and style (closes #12515) 2017-03-21 19:59:27 +01:00			`video_url = js_vars['mediastring']`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00
			`title = self._search_regex(`
			`r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)`

[pornhub] Improve title extraction (Closes #9777) 2016-06-13 23:57:59 +02:00			`# video_title from flashvars contains whitespace instead of non-ASCII (see`
			`# http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying`
			`# on that anymore.`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`title = title or self._html_search_meta(`
[pornhub] Improve title extraction (Closes #9777) 2016-06-13 23:57:59 +02:00			`'twitter:title', webpage, default=None) or self._search_regex(`
			`(r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',`
			`r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',`
			`r'shareTitle\s=\s(["\'])(?P<title>.+?)\1'),`
			`webpage, 'title', group='title')`

[pornhub] Fix thumbnail and duration extraction (Closes #8604) 2016-02-19 16:42:46 +01:00			`flashvars = self._parse_json(`
			`self._search_regex(`
[pornhub] Fix typo (Closes #9008) 2016-03-27 21:21:44 +02:00			`r'var\s+flashvars_\d+\s=\s({.+?});', webpage, 'flashvars', default='{}'),`
[pornhub] Fix thumbnail and duration extraction (Closes #8604) 2016-02-19 16:42:46 +01:00			`video_id)`
			`if flashvars:`
			`thumbnail = flashvars.get('image_url')`
			`duration = int_or_none(flashvars.get('video_duration'))`
			`else:`
[pornhub] Improve title extraction (Closes #9777) 2016-06-13 23:57:59 +02:00			`title, thumbnail, duration = [None] * 3`
[pornhub] Fix thumbnail and duration extraction (Closes #8604) 2016-02-19 16:42:46 +01:00
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00			`video_uploader = self._html_search_regex(`
[pornhub] Fix uploader regex 2015-02-19 17:15:49 +01:00			`r'(?s)From: .+?<(?:a href="/users/\|a href="/channels/\|span class="username)[^>]+>(.+?)<',`
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00			`webpage, 'uploader', fatal=False)`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00
[pornhub] Fix comment count extraction (Closes #5320) 2015-03-30 15:41:04 +02:00			`view_count = self._extract_count(`
			`r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')`
			`like_count = self._extract_count(`
			`r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')`
			`dislike_count = self._extract_count(`
			`r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')`
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00			`comment_count = self._extract_count(`
[pornhub] Fix comment count extraction (Closes #5320) 2015-03-30 15:41:04 +02:00			`r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')`
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00
[pornhub] Extract categories and tags (closes #10499) 2016-09-11 13:22:51 +02:00			`page_params = self._parse_json(self._search_regex(`
			`r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s=\s(?P<data>{[^}]+})',`
			`webpage, 'page parameters', group='data', default='{}'),`
			`video_id, transform_source=js_to_json, fatal=False)`
			`tags = categories = None`
			`if page_params:`
			`tags = page_params.get('tags', '').split(',')`
			`categories = page_params.get('categories', '').split(',')`

Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`return {`
			`'id': video_id,`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`'url': video_url,`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`'uploader': video_uploader,`
[pornhub] Improve title extraction (Closes #9777) 2016-06-13 23:57:59 +02:00			`'title': title,`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`'thumbnail': thumbnail,`
[pornhub] Fix thumbnail and duration extraction (Closes #8604) 2016-02-19 16:42:46 +01:00			`'duration': duration,`
[pornhub] Fix uploader extraction and extract counts 2014-03-22 15:29:01 +01:00			`'view_count': view_count,`
			`'like_count': like_count,`
			`'dislike_count': dislike_count,`
			`'comment_count': comment_count,`
[pornhub] Extract video URL from tv platform site (#12007, #12129) 2017-02-14 17:52:41 +01:00			`# 'formats': formats,`
Add the missing age_limit tags; added a devscript to do a superficial check for porn sites without the age_limit tag in the test 2013-10-28 06:50:17 +01:00			`'age_limit': 18,`
[pornhub] Extract categories and tags (closes #10499) 2016-09-11 13:22:51 +02:00			`'tags': tags,`
			`'categories': categories,`
Add support for http://www.pornhub.com 2013-10-27 01:04:22 +02:00			`}`
[pornhub] Add support for playlists (Closes #4995) 2015-02-19 17:15:19 +01:00

[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`class PornHubPlaylistBaseIE(InfoExtractor):`
			`def _extract_entries(self, webpage):`
			`return [`
[pornhub:playlistbase] Do not include videos not from playlist 2016-03-27 20:32:57 +02:00			`self.url_result(`
			`'http://www.pornhub.com/%s' % video_url,`
			`PornHubIE.ie_key(), video_title=title)`
			`for video_url, title in orderedSet(re.findall(`
			`r'href="/?(view_video\.php\?.\bviewkey=[\da-z]+[^"])"[^>]*\s+title="([^"]+)"',`
			`webpage))`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`]`
[pornhub] Add support for playlists (Closes #4995) 2015-02-19 17:15:19 +01:00
			`def _real_extract(self, url):`
			`playlist_id = self._match_id(url)`

			`webpage = self._download_webpage(url, playlist_id)`

[pornhub:playlist] Improve extraction (closes #11594) 2017-01-03 23:32:18 +01:00			`# Only process container div with main playlist content skipping`
			`# drop-down menu that uses similar pattern for videos (see`
			`# https://github.com/rg3/youtube-dl/issues/11594).`
			`container = self._search_regex(`
			`r'(?s)(<div[^>]+class=["\']container.+)', webpage,`
			`'container', default=webpage)`

			`entries = self._extract_entries(container)`
[pornhub] Add support for playlists (Closes #4995) 2015-02-19 17:15:19 +01:00
			`playlist = self._parse_json(`
			`self._search_regex(`
			`r'playlistObject\s=\s({.+?});', webpage, 'playlist'),`
			`playlist_id)`

			`return self.playlist_result(`
			`entries, playlist_id, playlist.get('title'), playlist.get('description'))`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00

			`class PornHubPlaylistIE(PornHubPlaylistBaseIE):`
			`_VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'`
			`_TESTS = [{`
[pornhub:playlist] Improve extraction (closes #11594) 2017-01-03 23:32:18 +01:00			`'url': 'http://www.pornhub.com/playlist/4667351',`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`'info_dict': {`
[pornhub:playlist] Improve extraction (closes #11594) 2017-01-03 23:32:18 +01:00			`'id': '4667351',`
			`'title': 'Nataly Hot',`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`},`
[pornhub:playlist] Improve extraction (closes #11594) 2017-01-03 23:32:18 +01:00			`'playlist_mincount': 2,`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`}]`


			`class PornHubUserVideosIE(PornHubPlaylistBaseIE):`
			`_VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'`
			`_TESTS = [{`
[pornhub:uservideos] Add support for multipage videos (Closes #9006) 2016-03-27 20:50:46 +02:00			`'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`'info_dict': {`
[pornhub:uservideos] Add support for multipage videos (Closes #9006) 2016-03-27 20:50:46 +02:00			`'id': 'zoe_ph',`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`},`
[pornhub:uservideos] Add support for multipage videos (Closes #9006) 2016-03-27 20:50:46 +02:00			`'playlist_mincount': 171,`
			`}, {`
			`'url': 'http://www.pornhub.com/users/rushandlia/videos',`
			`'only_matching': True,`
[pornhub:user:videos] Add extractor (Closes #8548) 2016-02-18 17:29:17 +01:00			`}]`

			`def _real_extract(self, url):`
			`user_id = self._match_id(url)`

[pornhub:uservideos] Add support for multipage videos (Closes #9006) 2016-03-27 20:50:46 +02:00			`entries = []`
			`for page_num in itertools.count(1):`
			`try:`
			`webpage = self._download_webpage(`
			`url, user_id, 'Downloading page %d' % page_num,`
			`query={'page': page_num})`
			`except ExtractorError as e:`
			`if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:`
			`break`
			`page_entries = self._extract_entries(webpage)`
			`if not page_entries:`
			`break`
			`entries.extend(page_entries)`

			`return self.playlist_result(entries, user_id)`