From ea0a8811577cbbe8807ad0560795332b11381839 Mon Sep 17 00:00:00 2001 From: Yarn Date: Tue, 21 Apr 2020 15:21:17 -0700 Subject: [PATCH 1/4] Update twitcasting extractor and add twitcasting history extractor --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/twitcasting.py | 101 +++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef803b8a7..8f2fac787 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1214,7 +1214,7 @@ from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE -from .twitcasting import TwitCastingIE +from .twitcasting import TwitCastingIE, TwitCastingHistoryIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py index 2dbe89f5b..d0bed5b0a 100644 --- a/youtube_dl/extractor/twitcasting.py +++ b/youtube_dl/extractor/twitcasting.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import re +import itertools + from .common import InfoExtractor from ..utils import urlencode_postdata -import re - class TwitCastingIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P[^/]+)/movie/(?P\d+)' @@ -56,15 +57,25 @@ class TwitCastingIE(InfoExtractor): r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)(?:(?!\1).)+)\1', - r'(["\'])(?Phttp.+?\.m3u8.*?)\1'), - webpage, 'm3u8 url', group='url') - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + # m3u8_url = self._search_regex( + # (r'data-movie-url=(["\'])(?P(?:(?!\1).)+)\1', + # r'(["\'])(?Phttp.+?\.m3u8.*?)\1'), + # webpage, 'm3u8 url', group='url') + # m3u8_url = m3u8_url.replace('\\/', '/') + # formats = self._extract_m3u8_formats( + # m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + # m3u8_id='hls') + formats = [ + { + 'url': "http://dl01.twitcasting.tv/{uploader_id}/download/{video_id}?dl=1".format(uploader_id=uploader_id, video_id=video_id), + 'ext': 'mp4', + } + ] thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description( @@ -79,3 +90,73 @@ class TwitCastingIE(InfoExtractor): 'uploader_id': uploader_id, 'formats': formats, } + + +class TwitCastingHistoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P[^/]+)/show' + _TESTS = [ + { + 'url': 'https://twitcasting.tv/mttbernardini/show/', + 'info_dict': { + 'title': 'Matteo Bernardini', + 'id': 'mttbernardini', + }, + 'playlist_count': 1, + }, + ] + + def _get_meta_and_entries(self, url): + for page_num in itertools.count(0): + page_url = "{}/{}".format(url.rstrip('/'), page_num) + pagenum = None + list_id = None + webpage = self._download_webpage( + page_url, list_id, + 'Downloading page %s' % pagenum) + + if page_num == 0: + # title = re.search(r'(.*)', webpage) + title = re.search(r'(?s)<[^>]+class=["\']tw-user-nav-name[^>]+>(.+?)1') != -1 + if page_num != 0 and first_page_selected: + break + + matches = re.finditer(r''']+class=["']tw-movie-thumbnail["'][^>]+href="(.+)"[^>]+>((?:\n|.)*?)''', webpage) + matches = list(matches) + + for match in matches: + href = match.group(1) + inner = match.group(2) + # if REC isn't present either a live broadcast or an image + # e.g. https://twitcasting.tv/marrynontan/movie/506296434 + if 'REC' not in inner: + continue + + # skip videos that require a password + # e.g. https://twitcasting.tv/mttbernardini/movie/3689740 + locked = re.search(r'''src="/img/locked.png"''', inner) + if locked is not None: + continue + + title = re.search(r'''<[^>]+class=["']tw-movie-thumbnail-title[^>]+>[ \n]*?(.+?) *? Date: Fri, 18 Sep 2020 02:12:20 -0700 Subject: [PATCH 2/4] Update twitcasting extractor --- youtube_dl/extractor/twitcasting.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py index d0bed5b0a..fc1dd0826 100644 --- a/youtube_dl/extractor/twitcasting.py +++ b/youtube_dl/extractor/twitcasting.py @@ -62,20 +62,14 @@ class TwitCastingIE(InfoExtractor): while ' ' in title: title = title.replace(' ', ' ') - # m3u8_url = self._search_regex( - # (r'data-movie-url=(["\'])(?P(?:(?!\1).)+)\1', - # r'(["\'])(?Phttp.+?\.m3u8.*?)\1'), - # webpage, 'm3u8 url', group='url') - # m3u8_url = m3u8_url.replace('\\/', '/') - # formats = self._extract_m3u8_formats( - # m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - # m3u8_id='hls') - formats = [ - { - 'url': "http://dl01.twitcasting.tv/{uploader_id}/download/{video_id}?dl=1".format(uploader_id=uploader_id, video_id=video_id), - 'ext': 'mp4', - } - ] + m3u8_url = self._search_regex( + (r'data-movie-url=(["\'])(?P(?:(?!\1).)+)\1', + r'(["\'])(?Phttp.+?\.m3u8.*?)\1'), + webpage, 'm3u8 url', group='url') + m3u8_url = m3u8_url.replace('\\/', '/') + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description( From 754d90c9a2c000e2f1bc6a89d78319fd77257f1f Mon Sep 17 00:00:00 2001 From: Yarn Date: Sat, 19 Sep 2020 03:24:10 -0700 Subject: [PATCH 3/4] Resolve issues with twitcasting extractor. --- youtube_dl/extractor/twitcasting.py | 37 +++++++++++++++++++---------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py index fc1dd0826..23b8542b7 100644 --- a/youtube_dl/extractor/twitcasting.py +++ b/youtube_dl/extractor/twitcasting.py @@ -53,10 +53,10 @@ class TwitCastingIE(InfoExtractor): }) webpage = self._download_webpage(url, video_id, data=request_data) - title = self._html_search_regex( + title = (self._html_search_regex( r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)(.*)', webpage) - title = re.search(r'(?s)<[^>]+class=["\']tw-user-nav-name[^>]+>(.+?)]+class=["\']tw-user-nav-name[^>]+>(.+?)1') != -1 @@ -137,13 +145,16 @@ class TwitCastingHistoryIE(InfoExtractor): if locked is not None: continue - title = re.search(r'''<[^>]+class=["']tw-movie-thumbnail-title[^>]+>[ \n]*?(.+?) *?]+class=["']tw-movie-thumbnail-title[^>]+>[ \n]*?(.+?) *? Date: Sat, 19 Sep 2020 04:29:06 -0700 Subject: [PATCH 4/4] satisfy flake8 --- youtube_dl/extractor/twitcasting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py index 23b8542b7..6431e58fb 100644 --- a/youtube_dl/extractor/twitcasting.py +++ b/youtube_dl/extractor/twitcasting.py @@ -154,7 +154,7 @@ class TwitCastingHistoryIE(InfoExtractor): video_url = 'https://twitcasting.tv%s' % href video_id = href.split('/')[-1] result = self.url_result(video_url, - ie=TwitCastingIE.ie_key(), video_id=video_id, video_title=title) + ie=TwitCastingIE.ie_key(), video_id=video_id, video_title=title) yield result def _real_extract(self, url):