From a5d5a2c068b00a8118fa9e3c32a9d93f316b2edd Mon Sep 17 00:00:00 2001
From: John Hawkinson <jhawk@mit.edu>
Date: Sun, 19 Mar 2017 21:52:13 -0400
Subject: [PATCH] [generic] utf8 decode before re.match(), for Python 3

Otherwise, on Python 3, re.match() is called with a str pattern on
bytes input (first_bytes) and raises
  TypeError: can't use a string pattern on a bytes-like object
This perhaps argues for moving the whitespace check into is_html(),
which already does this decoding. But a whitespace-only block is not
specific to HTML, so is_html() might then need to be renamed. I am
not sure which approach is simpler, so let's start with this.
---
 youtube_dl/extractor/generic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 54fadf7d8..dc73d23ff 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1759,7 +1759,7 @@ class GenericIE(InfoExtractor):
             self._sort_formats(info_dict['formats'])
             return info_dict
 
-        if re.match(r'^\s+$', first_bytes):
+        if re.match(r'^\s+$', first_bytes.decode('utf-8', 'replace')):
             self._downloader.report_warning(
                 'First block is just whitespace? Continuing...')
         elif not is_html(first_bytes):