[utils] Fix handling of comments in js_to_json (closes #23707, closes #23785)

2024-11-22 16:44:32 +01:00 · 2020-01-28 22:37:29 +01:00 · 2020-01-28 22:37:29 +01:00 · 13a91f1642
commit 13a91f1642
parent 51c7f40c83
2 changed files with 41 additions and 9 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -918,6 +918,34 @@ class TestUtil(unittest.TestCase):
        inp = '''{segments: [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}'''
        self.assertEqual(js_to_json(inp), '''{"segments": [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''')
        inp = '''{
            foo: "value",
            // bar: { nested:'x' },
            bar: { nested:'x' },
            chaff: "something"
        }'''
        self.assertEqual(js_to_json(inp), '''{
            "foo": "value",
            "bar": { "nested":"x" },
            "chaff": "something"
        }''')
        inp = '''{
            id: "player_prog",
            googleCast: true,
            //extraSettings: { googleCastReceiverAppId:'1A6F2224', skin:'s3',  skinAccentColor: '0073FF'},
            extraSettings: { googleCastReceiverAppId:'1A6F2224'},
            mediaType: "video",
        }'''
        self.assertEqual(js_to_json(inp), '''{
            "id": "player_prog",
            "googleCast": true,
            "extraSettings": { "googleCastReceiverAppId":"1A6F2224"},
            "mediaType": "video"
        }''')
    def test_js_to_json_edgecases(self):
        on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
        self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -3975,18 +3975,22 @@ def strip_jsonp(code):
 def js_to_json(code):
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
-        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
+        (r'(?s)^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
-        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
+        (r'(?s)^(0+[0-7]+)\s*:?$', 8),
    )
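    # Strip all // and /* */ comments up front, together with any spaces or
    # tabs immediately preceding them (newlines are left in place). Quoted
    # strings are matched first and substituted back unchanged via group 1,
    # so comment-like text inside a string literal is preserved.
    # This regular expression is based on this Stack Overflow answer:
    #   https://stackoverflow.com/a/25735600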
    code = re.sub(r'("(?:[^"\\]|\\[\s\S])*"|\'(?:[^\'\\]|\\[\s\S])*\')|[ \t]*//.*|[ \t]*/\*(?:[^*]|\*(?!/))*\*/',
                  '\\1', code)
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
-        elif v.startswith('/*') or v.startswith('//') or v == ',':
+        elif v == ',':
            return ""
        if v[0] in ("'", '"'):
@@ -4008,11 +4012,11 @@ def js_to_json(code):
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
-        {comment}|,(?={skip}[\]}}])|
+        ,(?=\s*[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
-        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
-        [0-9]+(?={skip}:)
+        [0-9]+(?=\s*:)
-        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+        ''', fix_kv, code)
 def qualities(quality_ids):