From a5d5a2c068b00a8118fa9e3c32a9d93f316b2edd Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 19 Mar 2017 21:52:13 -0400 Subject: [PATCH] [generic] utf8 decode before re.match(), for Python 3 Otherwise we raise TypeError: can't use a string pattern on a bytes-like object. This perhaps argues for putting it in is_html(), which already does this decoding. But of course plain whitespace isn't just HTML. So perhaps is_html() should be renamed? I'm not sure which is simpler. Let's start with this. --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 54fadf7d8..dc73d23ff 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1759,7 +1759,7 @@ class GenericIE(InfoExtractor): self._sort_formats(info_dict['formats']) return info_dict - if re.match(r'^\s+$', first_bytes): + if re.match(r'^\s+$', first_bytes.decode('utf-8', 'replace')): self._downloader.report_warning( 'First block is just whitespace? Continuing...') elif not is_html(first_bytes):