[utils] Fix handling of comments in js_to_json (closes #23707, closes #23785)

2025-01-07 13:47:54 +01:00 · 2020-01-28 22:37:29 +01:00 · 2020-01-28 22:37:29 +01:00 · 13a91f1642
commit 13a91f1642
parent 51c7f40c83
2 changed files with 41 additions and 9 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -918,6 +918,34 @@ class TestUtil(unittest.TestCase):
        inp = '''{segments: [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}'''
        self.assertEqual(js_to_json(inp), '''{"segments": [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''')

+        inp = '''{
+            foo: "value",
+            // bar: { nested:'x' },
+            bar: { nested:'x' },
+            chaff: "something"
+        }'''
+        self.assertEqual(js_to_json(inp), '''{
+            "foo": "value",
+
+            "bar": { "nested":"x" },
+            "chaff": "something"
+        }''')
+
+        inp = '''{
+            id: "player_prog",
+            googleCast: true,
+            //extraSettings: { googleCastReceiverAppId:'1A6F2224', skin:'s3',  skinAccentColor: '0073FF'},
+            extraSettings: { googleCastReceiverAppId:'1A6F2224'},
+            mediaType: "video",
+        }'''
+        self.assertEqual(js_to_json(inp), '''{
+            "id": "player_prog",
+            "googleCast": true,
+
+            "extraSettings": { "googleCastReceiverAppId":"1A6F2224"},
+            "mediaType": "video"
+        }''')
+
    def test_js_to_json_edgecases(self):
        on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
        self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -3975,18 +3975,22 @@ def strip_jsonp(code):


 def js_to_json(code):
-    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
-    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
-        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
-        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
+        (r'(?s)^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
+        (r'(?s)^(0+[0-7]+)\s*:?$', 8),
    )

+    # Remove all comments first, including all whitespace leading up to them.
+    # This regular expression is based on this Stack Overflow answer:
+    #   https://stackoverflow.com/a/25735600
+    code = re.sub(r'("(?:[^"\\]|\\[\s\S])*"|\'(?:[^\'\\]|\\[\s\S])*\')|[ \t]*//.*|[ \t]*/\*(?:[^*]|\*(?!/))*\*/',
+                  '\\1', code)
+
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
-        elif v.startswith('/*') or v.startswith('//') or v == ',':
+        elif v == ',':
            return ""

        if v[0] in ("'", '"'):
@@ -4008,11 +4012,11 @@ def js_to_json(code):
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
-        {comment}|,(?={skip}[\]}}])|
+        ,(?=\s*[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
-        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
-        [0-9]+(?={skip}:)
-        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
+        [0-9]+(?=\s*:)
+        ''', fix_kv, code)


 def qualities(quality_ids):