[generic] fix url src's unescape (closes ytdl-org#22704)

2024-11-22 16:44:32 +01:00 · 2019-10-14 16:53:36 +03:00 · 2019-10-14 16:53:36 +03:00 · d12babb5d9
commit d12babb5d9
parent c317b6163b
3 changed files with 71 additions and 0 deletions
--- a/test/test_compat.py
+++ b/test/test_compat.py
@ -21,6 +21,7 @@ from youtube_dl.compat import (
    compat_struct_unpack,
    compat_urllib_parse_unquote,
    compat_urllib_parse_unquote_plus,
+    compat_urllib_parse_quote,
    compat_urllib_parse_urlencode,
 )

@ -76,6 +77,15 @@ class TestCompat(unittest.TestCase):
        self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def')
        self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def')

+    def test_compat_urllib_parse_quote(self):
+        # default safe set keeps '~' and '/'; ' ', '+' and '%' get escaped
+        for raw, quoted in (
+                ('abc def', 'abc%20def'), ('~/abc+def', '~/abc%2Bdef'),
+                ('', ''),
+                ('%', '%25'), ('%%', '%25%25'), ('%%%', '%25%25%25'),
+                ('/', '/')):
+            self.assertEqual(compat_urllib_parse_quote(raw), quoted)
+
    def test_compat_urllib_parse_urlencode(self):
        self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def')
        self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def')
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@ -2358,10 +2358,23 @@ try:
    from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
    from urllib.parse import unquote as compat_urllib_parse_unquote
    from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
+    from urllib.parse import quote as _compat_urllib_parse_quote
+
+    def compat_urllib_parse_quote(string, safe='~/', encoding=None, errors=None):  # stdlib quote; default safe set also keeps '~'
+        return _compat_urllib_parse_quote(string, safe=safe, encoding=encoding, errors=errors)
+
 except ImportError:  # Python 2
    _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
                else re.compile(r'([\x00-\x7f]+)'))

+    _always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+                   'abcdefghijklmnopqrstuvwxyz'
+                   '0123456789' '_.-')  # characters that are never percent-encoded
+    _safe_map = {}  # byte character -> itself if always safe, else its '%XX' escape
+    for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
+        _safe_map[c] = c if (i < 128 and c in _always_safe) else '%{0:02X}'.format(i)
+    _safe_quoters = {}  # cache used by compat_urllib_parse_quote: (safe, _always_safe) -> (quoter, strip set)
+
    # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
    # implementations from cpython 3.4.3's stdlib. Python 2's version
    # is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244)
@ -2424,6 +2437,45 @@ except ImportError:  # Python 2
        string = string.replace('+', ' ')
        return compat_urllib_parse_unquote(string, encoding, errors)

+    def compat_urllib_parse_quote(s, safe='~/', encoding=None, errors=None):
+        """quote('abc def') -> 'abc%20def'
+
+        Percent-encode every character of s that is neither in _always_safe
+        nor in safe.  Modelled on Python 2's urllib.quote, but also accepts
+        text with non-ASCII characters, as urllib.parse.quote does on Python 3.
+
+        s        -- string to quote.  Text (anything but a byte string) is
+                    encoded with encoding/errors first: the quoting table is
+                    keyed by the 256 byte values, so looking up a non-ASCII
+                    unicode character would raise KeyError.
+        safe     -- extra characters to leave unquoted (default: '~' and the
+                    path separator '/'); non-ASCII ones are ignored.
+        encoding -- codec for text input, 'utf-8' when None.
+        errors   -- codec error handler for text input, 'strict' when None.
+
+        Returns s itself if it is empty or needs no quoting; None raises TypeError.
+        """
+        # fastpath
+        if not s:
+            if s is None:
+                raise TypeError('None object cannot be quoted')
+            return s
+        data = s if isinstance(s, bytes) else s.encode(encoding or 'utf-8', errors or 'strict')
+        if not isinstance(safe, bytes):
+            safe = safe.encode('ascii', 'ignore')
+        cachekey = (safe, _always_safe)
+        try:
+            (quoter, safe) = _safe_quoters[cachekey]
+        except KeyError:
+            safe_map = _safe_map.copy()
+            safe_map.update([(c, c) for c in safe])
+            quoter = safe_map.__getitem__
+            safe = str(_always_safe) + safe
+            _safe_quoters[cachekey] = (quoter, safe)
+        if not data.rstrip(safe):
+            return s
+        return ''.join(map(quoter, data))
+
 try:
    from urllib.parse import urlencode as compat_urllib_parse_urlencode
 except ImportError:  # Python 2
@ -3009,6 +3061,7 @@ __all__ = [
    'compat_tokenize_tokenize',
    'compat_urllib_error',
    'compat_urllib_parse',
+    'compat_urllib_parse_quote',
    'compat_urllib_parse_unquote',
    'compat_urllib_parse_unquote_plus',
    'compat_urllib_parse_unquote_to_bytes',
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -12,6 +12,7 @@ from ..compat import (
    compat_etree_fromstring,
    compat_str,
    compat_urllib_parse_unquote,
+    compat_urllib_parse_quote,
    compat_urlparse,
    compat_xml_parse_error,
 )
@ -2405,6 +2406,13 @@ class GenericIE(InfoExtractor):
        if camtasia_res is not None:
            return camtasia_res

+        # Percent-encode the contents of double-quoted strings first, so that
+        # the unquote of the whole page below restores them instead of decoding them
+        # see https://github.com/ytdl-org/youtube-dl/issues/22704
+        webpage = re.sub(
+            "\"(.*?)\"",
+            lambda x: "\"" + compat_urllib_parse_quote(x.group(1)) + "\"",
+            webpage)
        # Sometimes embedded video player is hidden behind percent encoding
        # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
        # Unescaping the whole page allows to handle those cases in a generic way