[ytsearch] Fix extraction (closes #26920 )

[afreecatv] Fix typo (#26970 )
[23video] Relax _VALID_URL (#26870 )
2020-10-23 21:31:37 +07:00 · 2020-10-22 19:15:05 +07:00 · 2020-10-20 00:56:23 +07:00 · 2020-10-18 00:10:41 +07:00 · 2020-10-17 23:14:46 +07:00 · 2020-10-17 23:02:17 +07:00
11 changed files with 117 additions and 60 deletions
--- a/README.md
+++ b/README.md
@ -545,7 +545,7 @@ The basic usage is not to set any template arguments when downloading a single f
 - `extractor` (string): Name of the extractor
 - `extractor_key` (string): Key name of the extractor
 - `epoch` (numeric): Unix epoch when creating the file
- - `autonumber` (numeric): Five-digit number that will be increased with each download, starting at zero
+ - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start`
 - `playlist` (string): Name or id of the playlist that contains the video
 - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the total length of the playlist
 - `playlist_id` (string): Playlist identifier
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -994,6 +994,12 @@ class TestUtil(unittest.TestCase):
        on = js_to_json('{42:4.2e1}')
        self.assertEqual(json.loads(on), {'42': 42.0})

+        on = js_to_json('{ "0x40": "0x40" }')
+        self.assertEqual(json.loads(on), {'0x40': '0x40'})
+
+        on = js_to_json('{ "040": "040" }')
+        self.assertEqual(json.loads(on), {'040': '040'})
+
    def test_js_to_json_malformed(self):
        self.assertEqual(js_to_json('42a1'), '42"a1"')
        self.assertEqual(js_to_json('42a-1'), '42"a"-1')
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@ -223,9 +223,10 @@ class HttpFD(FileDownloader):

            def retry(e):
                to_stdout = ctx.tmpfilename == '-'
-                if not to_stdout:
-                    ctx.stream.close()
-                ctx.stream = None
+                if ctx.stream is not None:
+                    if not to_stdout:
+                        ctx.stream.close()
+                    ctx.stream = None
                ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename))
                raise RetryDownload(e)

@ -240,7 +241,7 @@ class HttpFD(FileDownloader):
                except socket.error as e:
                    # SSLError on python 2 (inherits socket.error) may have
                    # no errno set but this error message
-                    if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message') == 'The read operation timed out':
+                    if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out':
                        retry(e)
                    raise

--- a/youtube_dl/extractor/afreecatv.py
+++ b/youtube_dl/extractor/afreecatv.py
@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor):
        video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
        if video_element is None or video_element.text is None:
            raise ExtractorError(
-                'Video %s video does not exist' % video_id, expected=True)
+                'Video %s does not exist' % video_id, expected=True)

        video_url = video_element.text.strip()

--- a/youtube_dl/extractor/expressen.py
+++ b/youtube_dl/extractor/expressen.py
@ -15,7 +15,7 @@ from ..utils import (
 class ExpressenIE(InfoExtractor):
    _VALID_URL = r'''(?x)
                    https?://
-                        (?:www\.)?expressen\.se/
+                        (?:www\.)?(?:expressen|di)\.se/
                        (?:(?:tvspelare/video|videoplayer/embed)/)?
                        tv/(?:[^/]+/)*
                        (?P<id>[^/?#&]+)
@ -42,13 +42,16 @@ class ExpressenIE(InfoExtractor):
    }, {
        'url': 'https://www.expressen.se/videoplayer/embed/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
        'only_matching': True,
+    }, {
+        'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
+        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        return [
            mobj.group('url') for mobj in re.finditer(
-                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?expressen\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1',
+                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1',
                webpage)]

    def _real_extract(self, url):
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@ -86,7 +86,8 @@ class IPrimaIE(InfoExtractor):
            (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)',
             r'data-product="([^"]+)">',
             r'id=["\']player-(p\d+)"',
-             r'playerId\s*:\s*["\']player-(p\d+)'),
+             r'playerId\s*:\s*["\']player-(p\d+)',
+             r'\bvideos\s*=\s*["\'](p\d+)'),
            webpage, 'real id')

        playerpage = self._download_webpage(
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@ -150,7 +150,7 @@ class IqiyiSDKInterpreter(object):
            elif function in other_functions:
                other_functions[function]()
            else:
-                raise ExtractorError('Unknown funcion %s' % function)
+                raise ExtractorError('Unknown function %s' % function)

        return sdk.target

--- a/youtube_dl/extractor/twentythreevideo.py
+++ b/youtube_dl/extractor/twentythreevideo.py
@ -8,8 +8,8 @@ from ..utils import int_or_none

 class TwentyThreeVideoIE(InfoExtractor):
    IE_NAME = '23video'
-    _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
-    _TEST = {
+    _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
+    _TESTS = [{
        'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1',
        'md5': '75fcf216303eb1dae9920d651f85ced4',
        'info_dict': {
@ -21,11 +21,14 @@ class TwentyThreeVideoIE(InfoExtractor):
            'uploader_id': '12258964',
            'uploader': 'Rasmus Bysted',
        }
-    }
+    }, {
+        'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620',
+        'only_matching': True,
+    }]

    def _real_extract(self, url):
        domain, query, photo_id = re.match(self._VALID_URL, url).groups()
-        base_url = 'https://video.%s' % domain
+        base_url = 'https://%s' % domain
        photo_data = self._download_json(
            base_url + '/api/photo/list?' + query, photo_id, query={
                'format': 'json',
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@ -19,7 +19,7 @@ from ..utils import (


 class UstreamIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
    IE_NAME = 'ustream'
    _TESTS = [{
        'url': 'http://www.ustream.tv/recorded/20274954',
@ -67,12 +67,15 @@ class UstreamIE(InfoExtractor):
        'params': {
            'skip_download': True,  # m3u8 download
        },
+    }, {
+        'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100',
+        'only_matching': True,
    }]

    @staticmethod
    def _extract_url(webpage):
        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
        if mobj is not None:
            return mobj.group('url')

--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -3181,54 +3181,94 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
-    _EXTRA_QUERY_ARGS = {}
+    _SEARCH_PARAMS = None
    _TESTS = []

+    def _entries(self, query, n):
+        data = {
+            'context': {
+                'client': {
+                    'clientName': 'WEB',
+                    'clientVersion': '2.20201021.03.00',
+                }
+            },
+            'query': query,
+        }
+        if self._SEARCH_PARAMS:
+            data['params'] = self._SEARCH_PARAMS
+        total = 0
+        for page_num in itertools.count(1):
+            search = self._download_json(
+                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+                video_id='query "%s"' % query,
+                note='Downloading page %s' % page_num,
+                errnote='Unable to download API page', fatal=False,
+                data=json.dumps(data).encode('utf8'),
+                headers={'content-type': 'application/json'})
+            if not search:
+                break
+            slr_contents = try_get(
+                search,
+                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
+                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
+                list)
+            if not slr_contents:
+                break
+            isr_contents = try_get(
+                slr_contents,
+                lambda x: x[0]['itemSectionRenderer']['contents'],
+                list)
+            if not isr_contents:
+                break
+            for content in isr_contents:
+                if not isinstance(content, dict):
+                    continue
+                video = content.get('videoRenderer')
+                if not isinstance(video, dict):
+                    continue
+                video_id = video.get('videoId')
+                if not video_id:
+                    continue
+                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
+                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
+                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
+                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
+                view_count = int_or_none(self._search_regex(
+                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
+                    'view count', default=None))
+                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
+                total += 1
+                yield {
+                    '_type': 'url_transparent',
+                    'ie_key': YoutubeIE.ie_key(),
+                    'id': video_id,
+                    'url': video_id,
+                    'title': title,
+                    'description': description,
+                    'duration': duration,
+                    'view_count': view_count,
+                    'uploader': uploader,
+                }
+                if total == n:
+                    return
+            token = try_get(
+                slr_contents,
+                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+                compat_str)
+            if not token:
+                break
+            data['continuation'] = token
+
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
-
-        videos = []
-        limit = n
-
-        url_query = {
-            'search_query': query.encode('utf-8'),
-        }
-        url_query.update(self._EXTRA_QUERY_ARGS)
-        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
-
-        for pagenum in itertools.count(1):
-            data = self._download_json(
-                result_url, video_id='query "%s"' % query,
-                note='Downloading page %s' % pagenum,
-                errnote='Unable to download API page',
-                query={'spf': 'navigate'})
-            html_content = data[1]['body']['content']
-
-            if 'class="search-message' in html_content:
-                raise ExtractorError(
-                    '[youtube] No video results', expected=True)
-
-            new_videos = list(self._process_page(html_content))
-            videos += new_videos
-            if not new_videos or len(videos) > limit:
-                break
-            next_link = self._html_search_regex(
-                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
-                html_content, 'next link', default=None)
-            if next_link is None:
-                break
-            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)
-
-        if len(videos) > n:
-            videos = videos[:n]
-        return self.playlist_result(videos, query)
+        return self.playlist_result(self._entries(query, n), query)


 class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
-    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
+    _SEARCH_PARAMS = 'CAI%3D'


 class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -4088,12 +4088,12 @@ def js_to_json(code):
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
-
-        for regex, base in INTEGER_TABLE:
-            im = re.match(regex, v)
-            if im:
-                i = int(im.group(1), base)
-                return '"%d":' % i if v.endswith(':') else '%d' % i
+        else:
+            for regex, base in INTEGER_TABLE:
+                im = re.match(regex, v)
+                if im:
+                    i = int(im.group(1), base)
+                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v
Author	SHA1	Message	Date
Sergey M․	416da574ec	[ytsearch] Fix extraction (closes #26920 )	2020-10-23 21:31:37 +07:00
Toan Nguyen	48c5663c5f	[afreecatv] Fix typo (#26970 )	2020-10-22 19:15:05 +07:00
Hannu Hartikainen	7d740e7dc7	[23video] Relax _VALID_URL (#26870 )	2020-10-20 00:56:23 +07:00
Kevin O'Connor	4eda10499e	[utils] Don't attempt to coerce JS strings to numbers in js_to_json (#26851 ) The current logic in `js_to_json` tries to rewrite octal/hex numbers to decimal. However, when the logic actually happens the `"` or `'` have already been trimmed off. This causes what were originally strings, that happen to look like octal/hex numbers, to get rewritten to decimal and returned as a number rather than a string. In practive something like: ```js { "0x40": "foo", "040": "bar", } ``` would get rewritten as: ```json { 64: "foo", 32: "bar } ``` This is problematic since this isn't valid JSON as you cannot have non-string keys.	2020-10-18 00:10:41 +07:00
Sergio Livi	605535776a	[ustream] Add support for video.ibm.com (#26894 )	2020-10-17 23:14:46 +07:00
Felix Yan	1050e0d09f	[iqiyi] Fix typo (#26884 )	2020-10-17 23:02:17 +07:00
Sergey M․	d65d89183f	[expressen] Add support for di.se (closes #26670 )	2020-09-24 07:37:10 +07:00
Surkal	0c92f1e96b	[iprima] Improve video id extraction (#26507 ) (closes #26494 )	2020-09-24 06:46:58 +07:00
Sergey M․	adae9e844b	[README.md] Fix autonumber sequence description (refs #26686 )	2020-09-24 06:36:07 +07:00
Sergey M․	c5764b3f89	[downloader/http] Properly handle missing message in SSLError (closes #26646 )	2020-09-22 07:01:59 +07:00
Sergey M․	0837992a22	[downloader/http] Fix access to not yet opened stream in retry	2020-09-22 06:44:14 +07:00