From 6206194c5ace95e5a825b4a58a395030804e7c20 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 19 Mar 2017 20:52:25 -0400 Subject: [PATCH 1/3] [generic] Replace LazyYT test with skiplagged discourse.ubuntu.com has gone away, replace with skiplagged.com. Be nice to have a non-frontpage URL that might be more stable, though I don't have one. Maybe this should move to html in test/test_InfoExtractor.py? --- youtube_dl/extractor/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a71d6bac0..b70b1dd6d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -902,12 +902,13 @@ class GenericIE(InfoExtractor): }, # LazyYT { - 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', + 'url': 'https://skiplagged.com/', 'info_dict': { - 'id': '1986', - 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', + 'id': 'skiplagged', + 'title': 'Skiplagged: The smart way to find cheap flights', }, - 'playlist_mincount': 2, + 'playlist_mincount': 1, + 'add_ie': ['Youtube'], }, # Cinchcast embed { From 00bc75ca0115fa57ffc700357ba6ef86f3355bb9 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 19 Mar 2017 21:01:47 -0400 Subject: [PATCH 2/3] [generic] Allow parsing when first 512 bytes are whitespace is_html(first_bytes) will fail if the first 512 bytes of the URL are all whitespace, for some weird reason. Such a case probably is not a direct video link, the case we're concerned about downloading inadvertently, since that wouldn't be a valid video binary file format. But it's still peculiar, so don't silently ignore it -- print a warning and continue on. 
--- youtube_dl/extractor/generic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b70b1dd6d..54fadf7d8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1759,9 +1759,12 @@ class GenericIE(InfoExtractor): self._sort_formats(info_dict['formats']) return info_dict - # Maybe it's a direct link to a video? - # Be careful not to download the whole thing! - if not is_html(first_bytes): + if re.match(r'^\s+$', first_bytes): + self._downloader.report_warning( + 'First block is just whitespace? Continuing...') + elif not is_html(first_bytes): + # Maybe it's a direct link to a video? + # Be careful not to download the whole thing! self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') info_dict.update({ From a5d5a2c068b00a8118fa9e3c32a9d93f316b2edd Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 19 Mar 2017 21:52:13 -0400 Subject: [PATCH 3/3] [generic] utf8 decode before re.match(), for Python 3 Otherwise we raise TypeError: can't use a string pattern on a bytes-like object This perhaps argues for putting it in is_html(), which already does this decoding. But of course plain whitespace isn't just html. So perhaps renaming is_html()? I dunno what is simpler. Let's start with this. --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 54fadf7d8..dc73d23ff 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1759,7 +1759,7 @@ class GenericIE(InfoExtractor): self._sort_formats(info_dict['formats']) return info_dict - if re.match(r'^\s+$', first_bytes): + if re.match(r'^\s+$', first_bytes.decode('utf-8', 'replace')): self._downloader.report_warning( 'First block is just whitespace? Continuing...') elif not is_html(first_bytes):