From 24fa01bb5cabd878c80d2664b4ee4f2546a3a969 Mon Sep 17 00:00:00 2001 From: Tristan Waddington Date: Sat, 7 Mar 2020 21:17:50 -0800 Subject: [PATCH] Added login support for PornHub and PornHub Premium. The pornhub extractor has been updated with support for --netrc and --username/password authentication. This change allows authenticated users to archive content they have purchased. --- youtube_dl/extractor/pornhub.py | 79 ++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3567a3283..60f97a203 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -22,6 +22,7 @@ from ..utils import ( remove_quotes, str_to_int, url_or_none, + urlencode_postdata, ) @@ -46,6 +47,63 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _login_if_required(self, host): + login_info = self._get_login_info(netrc_machine=host.split('.')[0]) + + # PornHub Premium requires some kind of authentication + if 'premium' in host: + cookie_file = self._downloader.params.get('cookiefile') + if not cookie_file and not all(login_info): + raise ExtractorError( + 'PornHub Premium requires authentication.' + ' You may want to use --cookies or --netrc.', + expected=True) + + # Authenticate, if required + cookies = self._get_cookies('https://%s' % host) + if all(login_info) and not cookies: + self._login(host, login_info) + + def _login(self, host, login_info): + username = login_info[0] + password = login_info[1] + + if 'premium' in host: + login_form_url = 'https://%s/premium/login' % host + login_post_url = 'https://www.%s/front/authenticate' % host + else: + login_form_url = 'https://%s/login' % host + login_post_url = 'https://www.%s/front/authenticate' % host + + # Fetch login page + login_page = self._download_webpage( + login_form_url, video_id=None, note='Fetching login page', tries=3, fatal=True) + + # Fetch login form + login_form = self._hidden_inputs(login_page) + login_form.update({ + 'username': username, + 'password': password, + }) + + # Submit sign-in request + response = self._download_json( + login_post_url, video_id=None, note='Logging in to %s' % host, fatal=True, + data=urlencode_postdata(login_form), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': login_form_url, + }) + + # Success + if response.get('success') == '1': + return self.to_screen("Successfully authenticated") + + # Error + login_error = response.get('message') + if login_error: + raise ExtractorError('Unable to login: %s' % login_error, expected=True) + self.report_warning('Login has probably failed') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -169,15 +227,11 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) - self._set_cookie(host, 'age_verified', '1') + # Authenticate, if required + self._login_if_required(host) + def dl_webpage(platform): self._set_cookie(host, 'platform', platform) return self._download_webpage( @@ -398,6 +452,9 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): host = mobj.group('host') playlist_id = mobj.group('id') + # Authenticate, if required + self._login_if_required(host) + webpage = self._download_webpage(url, playlist_id) entries = self._extract_entries(webpage, host) @@ -438,7 +495,12 @@ class PornHubUserIE(PornHubPlaylistBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') user_id = mobj.group('id') + + # Authenticate, if required + self._login_if_required(host) + return self.url_result( '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) @@ -459,6 +521,9 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') + # Authenticate, if required + self._login_if_required(host) + page = int_or_none(self._search_regex( r'\bpage=(\d+)', url, 'page', default=None))