diff --git a/test/test_compat.py b/test/test_compat.py
index 86ff389fd..6adcf436e 100644
--- a/test/test_compat.py
+++ b/test/test_compat.py
@@ -21,6 +21,7 @@ from youtube_dl.compat import (
     compat_struct_unpack,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
+    compat_urllib_parse_quote,
     compat_urllib_parse_urlencode,
 )
 
@@ -76,6 +77,17 @@ class TestCompat(unittest.TestCase):
         self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def')
         self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def')
 
+    def test_compat_urllib_parse_quote(self):
+        self.assertEqual(compat_urllib_parse_quote('abc def'), 'abc%20def')
+        self.assertEqual(compat_urllib_parse_quote('~/abc+def'), '~/abc%2Bdef')
+        self.assertEqual(compat_urllib_parse_quote(''), '')
+        self.assertEqual(compat_urllib_parse_quote('%'), '%25')
+        self.assertEqual(compat_urllib_parse_quote('%%'), '%25%25')
+        self.assertEqual(compat_urllib_parse_quote('%%%'), '%25%25%25')
+        self.assertEqual(compat_urllib_parse_quote('/'), '/')
+        self.assertEqual(compat_urllib_parse_quote('\xe9'), '%C3%A9')
+        self.assertEqual(compat_urllib_parse_quote('/', safe=''), '%2F')
+
     def test_compat_urllib_parse_urlencode(self):
         self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def')
         self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def')
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index c75ab131b..be8425060 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -2358,10 +2358,25 @@ try:
     from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
     from urllib.parse import unquote as compat_urllib_parse_unquote
     from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
+    from urllib.parse import quote as _compat_urllib_parse_quote
+
+    def compat_urllib_parse_quote(string, safe='~/', encoding=None, errors=None):
+        """urllib.parse.quote() with '~' safe by default (the stdlib only does that since Python 3.7)"""
+        return _compat_urllib_parse_quote(string, safe, encoding, errors)
+
 except ImportError:  # Python 2
     _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
                 else re.compile(r'([\x00-\x7f]+)'))
 
+    # Lookup tables for compat_urllib_parse_quote(): byte -> itself or its '%XX' escape
+    _always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+                    'abcdefghijklmnopqrstuvwxyz'
+                    '0123456789' '_.-')
+    _safe_map = {}
+    for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
+        _safe_map[c] = c if (i < 128 and c in _always_safe) else '%{0:02X}'.format(i)
+    _safe_quoters = {}
+
     # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
     # implementations from cpython 3.4.3's stdlib. Python 2's version
     # is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244)
@@ -2424,6 +2439,55 @@ except ImportError:  # Python 2
         string = string.replace('+', ' ')
         return compat_urllib_parse_unquote(string, encoding, errors)
 
+    def compat_urllib_parse_quote(s, safe='~/', encoding=None, errors=None):
+        """quote('abc def') -> 'abc%20def'
+
+        Each part of a URL, e.g. the path info, the query, etc., has a
+        different set of reserved characters that must be quoted.
+
+        RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
+        the following reserved characters.
+
+        reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+                   "$" | ","
+
+        Each of these characters is reserved in some component of a URL,
+        but not necessarily in all of them.
+
+        By default, the quote function is intended for quoting the path
+        section of a URL. Thus, it will not encode '/'. This character
+        is reserved, but in typical usage the quote function is being
+        called on a path where the existing slash characters are used as
+        reserved characters.
+
+        Text input is encoded with `encoding` (default 'utf-8') and
+        `errors` (default 'strict') before being quoted, like Python 3's
+        urllib.parse.quote() does; bytes are quoted as they are.
+        Non-ASCII characters in a text `safe` are ignored.
+        """
+        # fastpath
+        if not s:
+            if s is None:
+                raise TypeError('None object cannot be quoted')
+            return s
+        # _safe_map is keyed by single bytes, so quote the encoded octets:
+        # looking up a non-ASCII text character would raise KeyError
+        data = s if isinstance(s, bytes) else s.encode(encoding or 'utf-8', errors or 'strict')
+        if not isinstance(safe, bytes):
+            safe = safe.encode('ascii', 'ignore')
+        cachekey = (safe, _always_safe)
+        try:
+            (quoter, safe) = _safe_quoters[cachekey]
+        except KeyError:
+            safe_map = _safe_map.copy()
+            safe_map.update([(c, c) for c in safe])
+            quoter = safe_map.__getitem__
+            safe = _always_safe.encode('ascii') + safe
+            _safe_quoters[cachekey] = (quoter, safe)
+        if not data.rstrip(safe):
+            return s
+        return ''.join(map(quoter, data))
+
 try:
     from urllib.parse import urlencode as compat_urllib_parse_urlencode
 except ImportError:  # Python 2
@@ -3009,6 +3073,7 @@ __all__ = [
     'compat_tokenize_tokenize',
     'compat_urllib_error',
     'compat_urllib_parse',
+    'compat_urllib_parse_quote',
     'compat_urllib_parse_unquote',
     'compat_urllib_parse_unquote_plus',
     'compat_urllib_parse_unquote_to_bytes',
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index ec43c5ae4..26d58f0be 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -12,6 +12,7 @@ from ..compat import (
     compat_etree_fromstring,
     compat_str,
     compat_urllib_parse_unquote,
+    compat_urllib_parse_quote,
     compat_urlparse,
     compat_xml_parse_error,
 )
@@ -2405,6 +2406,13 @@ class GenericIE(InfoExtractor):
         if camtasia_res is not None:
             return camtasia_res
 
+        # The whole page is percent-decoded below; percent-encode the contents
+        # of double-quoted strings first so that they survive that unquote
+        # unchanged (see https://github.com/ytdl-org/youtube-dl/issues/22704)
+        webpage = re.sub(
+            r'"(.*?)"',
+            lambda m: '"%s"' % compat_urllib_parse_quote(m.group(1)),
+            webpage)
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way