From 5c62bedd6b6287429bd49d440ef0a020b7d66235 Mon Sep 17 00:00:00 2001
From: uno20001
Date: Fri, 24 Jan 2020 22:57:45 +0100
Subject: [PATCH] [utils.py:js_to_json] add support for octal escape sequences

---
 test/test_utils.py  |  3 +++
 youtube_dl/utils.py | 27 +++++++++++++++++++--------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 0896f4150..72157dab6 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -918,6 +918,9 @@ class TestUtil(unittest.TestCase):
         inp = '''{segments: [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}'''
         self.assertEqual(js_to_json(inp), '''{"segments": [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''')
 
+        inp = '''{label: "Fran\\347ais"}'''
+        self.assertEqual(js_to_json(inp), '''{"label": "Fran\\u00e7ais"}''')
+
     def test_js_to_json_edgecases(self):
         on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
         self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index f6204692a..8ed94d923 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -3982,20 +3982,31 @@ def js_to_json(code):
         (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
     )
 
+    def convert_escapes(m):
+        # convert JavaScript's octal escape sequences (and '\0')
+        # into valid JSON escape sequences (e.g. '\347' => '\u00e7', '\0' => '\u0000')
+        if m.group(1):
+            return "\\u%04x" % int(m.group(1), 8)
+
+        # convert the remaining escape sequences
+        # into valid JSON
+        return {
+            '"': '\\"',
+            "\\'": "'",
+            '\\\n': '',
+            '\\x': '\\u00',
+        }.get(m.group(0), m.group(0))
+
     def fix_kv(m):
         v = m.group(0)
+
         if v in ('true', 'false', 'null'):
             return v
         elif v.startswith('/*') or v.startswith('//') or v == ',':
             return ""
 
         if v[0] in ("'", '"'):
-            v = re.sub(r'(?s)\\.|"', lambda m: {
-                '"': '\\"',
-                "\\'": "'",
-                '\\\n': '',
-                '\\x': '\\u00',
-            }.get(m.group(0), m.group(0)), v[1:-1])
+            v = re.sub(r'(?s)\\(?:([0-3][0-7]{0,2}|[4-7][0-7]?)|.)|"', convert_escapes, v[1:-1])
 
         for regex, base in INTEGER_TABLE:
             im = re.match(regex, v)
@@ -4006,8 +4017,8 @@ def js_to_json(code):
             return '"%s"' % v
 
     return re.sub(r'''(?sx)
-        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
-        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
+        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n01234567]))*[^"\\]*"|
+        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n01234567]))*[^'\\]*'|
         {comment}|,(?={skip}[\]}}])|
         (?:(?