mirror of
https://codeberg.org/polarisfm/youtube-dl
synced 2024-11-22 16:44:32 +01:00
[generic] fix url src's unescape(closes ytdl-org#22704)
This commit is contained in:
parent
c317b6163b
commit
d12babb5d9
@ -21,6 +21,7 @@ from youtube_dl.compat import (
|
|||||||
compat_struct_unpack,
|
compat_struct_unpack,
|
||||||
compat_urllib_parse_unquote,
|
compat_urllib_parse_unquote,
|
||||||
compat_urllib_parse_unquote_plus,
|
compat_urllib_parse_unquote_plus,
|
||||||
|
compat_urllib_parse_quote,
|
||||||
compat_urllib_parse_urlencode,
|
compat_urllib_parse_urlencode,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -76,6 +77,15 @@ class TestCompat(unittest.TestCase):
|
|||||||
self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def')
|
self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def')
|
||||||
self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def')
|
self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def')
|
||||||
|
|
||||||
|
def test_compat_urllib_parse_quote(self):
    """Check compat_urllib_parse_quote with its default safe set ('~/')."""
    # Spaces and '+' are percent-encoded; '~' and '/' pass through untouched.
    self.assertEqual(compat_urllib_parse_quote('abc def'), 'abc%20def')
    self.assertEqual(compat_urllib_parse_quote('~/abc+def'), '~/abc%2Bdef')
    # The empty string is returned as is.
    self.assertEqual(compat_urllib_parse_quote(''), '')
    # Every literal '%' is encoded, however many appear in a row.
    self.assertEqual(compat_urllib_parse_quote('%'), '%25')
    self.assertEqual(compat_urllib_parse_quote('%%'), '%25%25')
    self.assertEqual(compat_urllib_parse_quote('%%%'), '%25%25%25')
    self.assertEqual(compat_urllib_parse_quote('/'), '/')
|
||||||
|
|
||||||
def test_compat_urllib_parse_urlencode(self):
|
def test_compat_urllib_parse_urlencode(self):
|
||||||
self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def')
|
self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def')
|
||||||
self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def')
|
self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def')
|
||||||
|
@ -2358,10 +2358,23 @@ try:
|
|||||||
from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
|
from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
|
||||||
from urllib.parse import unquote as compat_urllib_parse_unquote
|
from urllib.parse import unquote as compat_urllib_parse_unquote
|
||||||
from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
|
from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
|
||||||
|
from urllib.parse import quote as _compat_urllib_parse_quote
|
||||||
|
|
||||||
|
def compat_urllib_parse_quote(string, safe='~/', encoding=None, errors=None):
    """Percent-encode *string* using the standard library's urllib.parse.quote.

    The only difference from calling urllib.parse.quote directly is the
    default for *safe*: it is '~/' here, the same default the Python 2
    fallback of this function uses, so '~' and '/' are left unquoted.

    *encoding* and *errors* are handed to urllib.parse.quote unchanged;
    None selects that function's own defaults.
    """
    return _compat_urllib_parse_quote(
        string, safe=safe, encoding=encoding, errors=errors)
|
||||||
|
|
||||||
except ImportError: # Python 2
|
except ImportError: # Python 2
|
||||||
_asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
|
_asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
|
||||||
else re.compile(r'([\x00-\x7f]+)'))
|
else re.compile(r'([\x00-\x7f]+)'))
|
||||||
|
|
||||||
|
# Lookup tables used by the Python 2 implementation of
# compat_urllib_parse_quote.

# Characters that are never percent-encoded.
_always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz'
                '0123456789' '_.-')
# Maps each of the 256 single-byte characters either to itself (when it is
# in _always_safe) or to its '%XX' escape.
# chr(i) yields the same one-byte keys on Python 2 as
# str(bytearray(xrange(256))) does, without relying on the Python 2-only
# builtin xrange.
_safe_map = {}
for i in range(256):
    c = chr(i)
    _safe_map[c] = c if (i < 128 and c in _always_safe) else '%{0:02X}'.format(i)
# Cache filled lazily by compat_urllib_parse_quote, keyed by
# (safe, _always_safe).
_safe_quoters = {}
|
||||||
|
|
||||||
# HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
|
# HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
|
||||||
# implementations from cpython 3.4.3's stdlib. Python 2's version
|
# implementations from cpython 3.4.3's stdlib. Python 2's version
|
||||||
# is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244)
|
# is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244)
|
||||||
@ -2424,6 +2437,45 @@ except ImportError: # Python 2
|
|||||||
string = string.replace('+', ' ')
|
string = string.replace('+', ' ')
|
||||||
return compat_urllib_parse_unquote(string, encoding, errors)
|
return compat_urllib_parse_unquote(string, encoding, errors)
|
||||||
|
|
||||||
|
def compat_urllib_parse_quote(s, safe='~/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Python 2 implementation. Text (unicode) input is encoded as UTF-8
    before quoting, so non-ASCII characters become the '%XX' escapes of
    their UTF-8 bytes instead of raising KeyError.

    Raises TypeError if s is None. Any other false value (e.g. the
    empty string) is returned unchanged.
    """
    # fastpath
    if not s:
        if s is None:
            raise TypeError('None object cannot be quoted')
        return s
    # _safe_map only has single-byte keys, so a non-ASCII unicode
    # character would raise KeyError in the quoter below. Quote the UTF-8
    # bytes instead.
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    cachekey = (safe, _always_safe)
    try:
        (quoter, safe) = _safe_quoters[cachekey]
    except KeyError:
        # First call with this safe set: build its quoter and cache it.
        safe_map = _safe_map.copy()
        safe_map.update([(c, c) for c in safe])
        quoter = safe_map.__getitem__
        safe = _always_safe + safe
        # Keep the strip set as a byte string: stripping a byte string
        # with a unicode argument makes Python 2 decode s as ASCII, which
        # fails on non-ASCII bytes.
        if not isinstance(safe, bytes):
            safe = safe.encode('utf-8')
        _safe_quoters[cachekey] = (quoter, safe)
    # Nothing left after stripping safe characters: nothing to quote.
    if not s.rstrip(safe):
        return s
    return ''.join(map(quoter, s))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import urlencode as compat_urllib_parse_urlencode
|
from urllib.parse import urlencode as compat_urllib_parse_urlencode
|
||||||
except ImportError: # Python 2
|
except ImportError: # Python 2
|
||||||
@ -3009,6 +3061,7 @@ __all__ = [
|
|||||||
'compat_tokenize_tokenize',
|
'compat_tokenize_tokenize',
|
||||||
'compat_urllib_error',
|
'compat_urllib_error',
|
||||||
'compat_urllib_parse',
|
'compat_urllib_parse',
|
||||||
|
'compat_urllib_parse_quote',
|
||||||
'compat_urllib_parse_unquote',
|
'compat_urllib_parse_unquote',
|
||||||
'compat_urllib_parse_unquote_plus',
|
'compat_urllib_parse_unquote_plus',
|
||||||
'compat_urllib_parse_unquote_to_bytes',
|
'compat_urllib_parse_unquote_to_bytes',
|
||||||
|
@ -12,6 +12,7 @@ from ..compat import (
|
|||||||
compat_etree_fromstring,
|
compat_etree_fromstring,
|
||||||
compat_str,
|
compat_str,
|
||||||
compat_urllib_parse_unquote,
|
compat_urllib_parse_unquote,
|
||||||
|
compat_urllib_parse_quote,
|
||||||
compat_urlparse,
|
compat_urlparse,
|
||||||
compat_xml_parse_error,
|
compat_xml_parse_error,
|
||||||
)
|
)
|
||||||
@ -2405,6 +2406,13 @@ class GenericIE(InfoExtractor):
|
|||||||
if camtasia_res is not None:
|
if camtasia_res is not None:
|
||||||
return camtasia_res
|
return camtasia_res
|
||||||
|
|
||||||
|
# Percent-encode the contents of every double-quoted string first, so that
# the page-wide unquote below restores them unchanged instead of unescaping them
# see https://github.com/ytdl-org/youtube-dl/issues/22704
|
||||||
|
webpage = re.sub(
|
||||||
|
"\"(.*?)\"",
|
||||||
|
lambda x: "\"" + compat_urllib_parse_quote(x.group(1)) + "\"",
|
||||||
|
webpage)
|
||||||
# Sometimes embedded video player is hidden behind percent encoding
|
# Sometimes embedded video player is hidden behind percent encoding
|
||||||
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
|
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
|
||||||
# Unescaping the whole page allows to handle those cases in a generic way
|
# Unescaping the whole page allows to handle those cases in a generic way
|
||||||
|
Loading…
Reference in New Issue
Block a user