[YoutubeDL] rework how the format spec is processed

The spec string is processed using 'tokenize.tokenize' to split it into words and operators; the filters are still processed using regular expressions. This should make it easier to allow grouping operators with parentheses.
2025-02-18 18:17:55 +01:00 · 2015-06-28 22:08:29 +02:00 · 2015-06-28 22:08:29 +02:00 · 67134eaba1
commit 67134eaba1
parent ac0474f89d
3 changed files with 209 additions and 121 deletions
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -229,21 +229,30 @@ class TestFormatSelection(unittest.TestCase):
            '141', '172', '140', '171', '139',
        ]
-        for f1id, f2id in zip(order, order[1:]):
+        def format_info(f_id):
-            f1 = YoutubeIE._formats[f1id].copy()
+            info = YoutubeIE._formats[f_id].copy()
-            f1['format_id'] = f1id
+            info['format_id'] = f_id
-            f1['url'] = 'url:' + f1id
+            info['url'] = 'url:' + f_id
-            f2 = YoutubeIE._formats[f2id].copy()
+            return info
-            f2['format_id'] = f2id
+        formats_order = [format_info(f_id) for f_id in order]
            f2['url'] = 'url:' + f2id
        info_dict = _make_result(list(formats_order), extractor='youtube')
        ydl = YDL({'format': 'bestvideo+bestaudio'})
        yie = YoutubeIE(ydl)
        yie._sort_formats(info_dict['formats'])
        ydl.process_ie_result(info_dict)
        downloaded = ydl.downloaded_info_dicts[0]
        self.assertEqual(downloaded['format_id'], '137+141')
        self.assertEqual(downloaded['ext'], 'mp4')
        for f1, f2 in zip(formats_order, formats_order[1:]):
            info_dict = _make_result([f1, f2], extractor='youtube')
            ydl = YDL({'format': 'best/bestvideo'})
            yie = YoutubeIE(ydl)
            yie._sort_formats(info_dict['formats'])
            ydl.process_ie_result(info_dict)
            downloaded = ydl.downloaded_info_dicts[0]
-            self.assertEqual(downloaded['format_id'], f1id)
+            self.assertEqual(downloaded['format_id'], f1['format_id'])
            info_dict = _make_result([f2, f1], extractor='youtube')
            ydl = YDL({'format': 'best/bestvideo'})
@ -251,7 +260,7 @@ class TestFormatSelection(unittest.TestCase):
            yie._sort_formats(info_dict['formats'])
            ydl.process_ie_result(info_dict)
            downloaded = ydl.downloaded_info_dicts[0]
-            self.assertEqual(downloaded['format_id'], f1id)
+            self.assertEqual(downloaded['format_id'], f1['format_id'])
    def test_format_filtering(self):
        formats = [
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -21,6 +21,7 @@ import subprocess
 import socket
 import sys
 import time
 import tokenize
 import traceback
 if os.name == 'nt':
@ -34,6 +35,7 @@ from .compat import (
    compat_http_client,
    compat_kwargs,
    compat_str,
    compat_tokenize_tokenize,
    compat_urllib_error,
    compat_urllib_request,
 )
@ -851,8 +853,8 @@ class YoutubeDL(object):
        else:
            raise Exception('Invalid result type: %s' % result_type)
-    def _apply_format_filter(self, format_spec, available_formats):
+    def _build_format_filter(self, filter_spec):
-        " Returns a tuple of the remaining format_spec and filtered formats "
+        " Returns a function to filter the formats according to the filter_spec "
        OPERATORS = {
            '<': operator.lt,
@ -862,13 +864,13 @@ class YoutubeDL(object):
            '=': operator.eq,
            '!=': operator.ne,
        }
-        operator_rex = re.compile(r'''(?x)\s*\[
+        operator_rex = re.compile(r'''(?x)\s*
            (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
-            \]$
+            $
            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
-        m = operator_rex.search(format_spec)
+        m = operator_rex.search(filter_spec)
        if m:
            try:
                comparison_value = int(m.group('value'))
@ -879,7 +881,7 @@ class YoutubeDL(object):
                if comparison_value is None:
                    raise ValueError(
                        'Invalid value %r in format specification %r' % (
-                            m.group('value'), format_spec))
+                            m.group('value'), filter_spec))
            op = OPERATORS[m.group('op')]
        if not m:
@ -887,85 +889,201 @@ class YoutubeDL(object):
                '=': operator.eq,
                '!=': operator.ne,
            }
-            str_operator_rex = re.compile(r'''(?x)\s*\[
+            str_operator_rex = re.compile(r'''(?x)
                \s*(?P<key>ext|acodec|vcodec|container|protocol)
                \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
                \s*(?P<value>[a-zA-Z0-9_-]+)
-                \s*\]$
+                \s*$
                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
-            m = str_operator_rex.search(format_spec)
+            m = str_operator_rex.search(filter_spec)
            if m:
                comparison_value = m.group('value')
                op = STR_OPERATORS[m.group('op')]
        if not m:
-            raise ValueError('Invalid format specification %r' % format_spec)
+            raise ValueError('Invalid filter specification %r' % filter_spec)
        def _filter(f):
            """Tell whether the format dict ``f`` satisfies the parsed filter.

            When ``f`` has no value for the filtered key, the result is the
            text matched by the optional ``?`` marker (truthy) or None.
            """
            field_value = f.get(m.group('key'))
            if field_value is not None:
                return op(field_value, comparison_value)
            # Missing field: keep the format only if the spec carried a '?'
            return m.group('none_inclusive')
-        new_formats = [f for f in available_formats if _filter(f)]
+        return _filter
-        new_format_spec = format_spec[:-len(m.group(0))]
+    def build_format_selector(self, format_spec):
-        if not new_format_spec:
+        def syntax_error(note, start):
-            new_format_spec = 'best'
+            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)
-        return (new_format_spec, new_formats)
+        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
-    def select_format(self, format_spec, available_formats):
+        def _parse_filter(tokens):
-        while format_spec.endswith(']'):
+            filter_parts = []
-            format_spec, available_formats = self._apply_format_filter(
+            for type, string, start, _, _ in tokens:
-                format_spec, available_formats)
+                if type == tokenize.OP and string == ']':
-        if not available_formats:
+                    return ''.join(filter_parts)
-            return None
+                else:
                    filter_parts.append(string)
-        if format_spec in ['best', 'worst', None]:
+        def _parse_format_selection(tokens, endwith=[]):
-            format_idx = 0 if format_spec == 'worst' else -1
+            selectors = []
-            audiovideo_formats = [
+            current_selector = None
-                f for f in available_formats
+            for type, string, start, _, _ in tokens:
-                if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
+                # ENCODING is only defined in python 3.x
-            if audiovideo_formats:
+                if type == getattr(tokenize, 'ENCODING', None):
-                return audiovideo_formats[format_idx]
+                    continue
-            # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
+                elif type in [tokenize.NAME, tokenize.NUMBER]:
-            elif (all(f.get('acodec') != 'none' for f in available_formats) or
+                    current_selector = FormatSelector(SINGLE, string, [])
-                  all(f.get('vcodec') != 'none' for f in available_formats)):
+                elif type == tokenize.OP:
-                return available_formats[format_idx]
+                    if string in endwith:
-        elif format_spec == 'bestaudio':
+                        break
-            audio_formats = [
+                    if string == ',':
-                f for f in available_formats
+                        selectors.append(current_selector)
-                if f.get('vcodec') == 'none']
+                        current_selector = None
-            if audio_formats:
+                    elif string == '/':
-                return audio_formats[-1]
+                        first_choice = current_selector
-        elif format_spec == 'worstaudio':
+                        second_choice = _parse_format_selection(tokens, [','])
-            audio_formats = [
+                        current_selector = None
-                f for f in available_formats
+                        selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), []))
-                if f.get('vcodec') == 'none']
+                    elif string == '[':
-            if audio_formats:
+                        if not current_selector:
-                return audio_formats[0]
+                            current_selector = FormatSelector(SINGLE, 'best', [])
-        elif format_spec == 'bestvideo':
+                        format_filter = _parse_filter(tokens)
-            video_formats = [
+                        current_selector.filters.append(format_filter)
-                f for f in available_formats
+                    elif string == '+':
-                if f.get('acodec') == 'none']
+                        video_selector = current_selector
-            if video_formats:
+                        audio_selector = _parse_format_selection(tokens, [','])
-                return video_formats[-1]
+                        current_selector = None
-        elif format_spec == 'worstvideo':
+                        selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), []))
-            video_formats = [
+                    else:
-                f for f in available_formats
+                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
-                if f.get('acodec') == 'none']
+                elif type == tokenize.ENDMARKER:
-            if video_formats:
+                    break
-                return video_formats[0]
+            if current_selector:
-        else:
+                selectors.append(current_selector)
-            extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
+            return selectors
-            if format_spec in extensions:
+
-                filter_f = lambda f: f['ext'] == format_spec
+        def _build_selector_function(selector):
-            else:
+            if isinstance(selector, list):
-                filter_f = lambda f: f['format_id'] == format_spec
+                fs = [_build_selector_function(s) for s in selector]
-            matches = list(filter(filter_f, available_formats))
+
-            if matches:
+                def selector_function(formats):
-                return matches[-1]
+                    for f in fs:
-        return None
+                        for format in f(formats):
                            yield format
                return selector_function
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]
                def selector_function(formats):
                    """Return the formats of the first alternative that picks any.

                    Alternatives are tried in order and evaluated lazily; an
                    empty list is returned when none of them selects a format.
                    """
                    attempts = (list(pick(formats)) for pick in fs)
                    return next((picked for picked in attempts if picked), [])
            elif selector.type == SINGLE:
                format_spec = selector.selector
                def selector_function(formats):
                    """Yield at most one format chosen according to ``format_spec``.

                    ``formats`` is a sequence of format dicts; the last entry
                    is treated as the best one and the first as the worst.
                    ``format_spec`` may be 'best', 'worst', None (same as
                    'best'), 'bestaudio', 'worstaudio', 'bestvideo',
                    'worstvideo', a known file extension or a format id.
                    Nothing is yielded when no format matches.
                    """
                    formats = list(formats)
                    if not formats:
                        # all() is True for an empty sequence, so without this
                        # guard the 'best'/'worst' fallback below would index
                        # into an empty list (e.g. when the filters removed
                        # every format) and raise IndexError.
                        return
                    if format_spec in ['best', 'worst', None]:
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for audio only (soundcloud) or video only (imgur) urls,
                        # select the best/worst format among all of them
                        elif (all(f.get('acodec') != 'none' for f in formats) or
                              all(f.get('vcodec') != 'none' for f in formats)):
                            yield formats[format_idx]
                    elif format_spec in ('bestaudio', 'worstaudio'):
                        # Audio-only formats are the ones without a video codec
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1 if format_spec == 'bestaudio' else 0]
                    elif format_spec in ('bestvideo', 'worstvideo'):
                        # Video-only formats are the ones without an audio codec
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1 if format_spec == 'bestvideo' else 0]
                    else:
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        # A known extension selects by 'ext', anything else
                        # is taken to be a format id
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = [f for f in formats if filter_f(f)]
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    """Build the info dict for a merged video+audio download.

                    ``formats_info`` holds exactly two format dicts: the video
                    format first, the audio format second.  Video related
                    fields are copied from the first one and audio related
                    fields from the second.  If the first format has no video
                    an error is reported and None is returned.
                    """
                    format_1, format_2 = [fmt['format_id'] for fmt in formats_info]
                    video_info = formats_info[0]
                    audio_info = formats_info[1]
                    # The first format must contain the video and the
                    # second the audio
                    if video_info.get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # An explicitly requested merge container wins over the
                    # extension of the video format
                    output_ext = self.params.get('merge_output_format')
                    if output_ext is None:
                        output_ext = video_info['ext']
                    merged = {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (video_info.get('format'),
                                             audio_info.get('format')),
                        'format_id': '%s+%s' % (video_info.get('format_id'),
                                                audio_info.get('format_id')),
                        'ext': output_ext,
                    }
                    for key in ('width', 'height', 'resolution', 'fps',
                                'vcodec', 'vbr', 'stretched_ratio'):
                        merged[key] = video_info.get(key)
                    for key in ('acodec', 'abr'):
                        merged[key] = audio_info.get(key)
                    return merged
                video_selector, audio_selector = map(_build_selector_function, selector.selector)
                def selector_function(formats):
                    """Yield one merged format per (video, audio) combination."""
                    # Materialize once so that both selectors see every format
                    available = list(formats)
                    video_choices = video_selector(available)
                    audio_choices = audio_selector(available)
                    for video_audio in itertools.product(video_choices, audio_choices):
                        yield _merge(video_audio)
            filters = [self._build_format_filter(f) for f in selector.filters]
            def final_selector(formats):
                """Apply every filter in turn, then run the selector on what is left."""
                remaining = formats
                for keep in filters:
                    remaining = [fmt for fmt in remaining if keep(fmt)]
                return selector_function(remaining)
        stream = io.BytesIO(format_spec.encode('utf-8'))
        tokens = compat_tokenize_tokenize(stream.readline)
        parsed_selector = _parse_format_selection(tokens)
        return _build_selector_function(parsed_selector)
    def _calc_headers(self, info_dict):
        res = std_headers.copy()
@ -1112,52 +1230,8 @@ class YoutubeDL(object):
        if req_format == 'all':
            formats_to_download = formats
        else:
-            for rfstr in req_format.split(','):
+            format_selector = self.build_format_selector(req_format)
-                # We can accept formats requested in the format: 34/5/best, we pick
+            formats_to_download = list(format_selector(formats))
                # the first that is available, starting from left
                req_formats = rfstr.split('/')
                for rf in req_formats:
                    if re.match(r'.+?\+.+?', rf) is not None:
                        # Two formats have been requested like '137+139'
                        format_1, format_2 = rf.split('+')
                        formats_info = (self.select_format(format_1, formats),
                                        self.select_format(format_2, formats))
                        if all(formats_info):
                            # The first format must contain the video and the
                            # second the audio
                            if formats_info[0].get('vcodec') == 'none':
                                self.report_error('The first format must '
                                                  'contain the video, try using '
                                                  '"-f %s+%s"' % (format_2, format_1))
                                return
                            output_ext = (
                                formats_info[0]['ext']
                                if self.params.get('merge_output_format') is None
                                else self.params['merge_output_format'])
                            selected_format = {
                                'requested_formats': formats_info,
                                'format': '%s+%s' % (formats_info[0].get('format'),
                                                     formats_info[1].get('format')),
                                'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                        formats_info[1].get('format_id')),
                                'width': formats_info[0].get('width'),
                                'height': formats_info[0].get('height'),
                                'resolution': formats_info[0].get('resolution'),
                                'fps': formats_info[0].get('fps'),
                                'vcodec': formats_info[0].get('vcodec'),
                                'vbr': formats_info[0].get('vbr'),
                                'stretched_ratio': formats_info[0].get('stretched_ratio'),
                                'acodec': formats_info[1].get('acodec'),
                                'abr': formats_info[1].get('abr'),
                                'ext': output_ext,
                            }
                        else:
                            selected_format = None
                    else:
                        selected_format = self.select_format(rf, formats)
                    if selected_format is not None:
                        formats_to_download.append(selected_format)
                        break
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@ -388,6 +388,10 @@ else:
            pass
        return _terminal_size(columns, lines)
 if sys.version_info >= (3, 0):
    from tokenize import tokenize as compat_tokenize_tokenize
 else:
    from tokenize import generate_tokens as compat_tokenize_tokenize
 __all__ = [
    'compat_HTTPError',
@ -408,6 +412,7 @@ __all__ = [
    'compat_socket_create_connection',
    'compat_str',
    'compat_subprocess_get_DEVNULL',
    'compat_tokenize_tokenize',
    'compat_urllib_error',
    'compat_urllib_parse',
    'compat_urllib_parse_unquote',