From dd065a73050b78691b8bb8435d4b39f48a6c38f8 Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Sat, 18 Apr 2020 00:03:19 +0200 Subject: [PATCH 1/8] Add Archive class for efficient archiving Currently we reread the file for every new link we download. In cases where user runs youtube_dl to keep their library up-to-date, this is very inefficient, as most of the links are already downloaded. Archive class works half way between a database and an append log. It's an append log that also keeps a hash set in memory for fast checking existing links. A compatibility function was added for reading file's last modification date Tests have been added --- test/test_archive.py | 84 +++++++++++++++++++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 24 +++--------- youtube_dl/archive.py | 78 ++++++++++++++++++++++++++++++++++++++ youtube_dl/compat.py | 9 +++++ 4 files changed, 176 insertions(+), 19 deletions(-) create mode 100644 test/test_archive.py create mode 100644 youtube_dl/archive.py diff --git a/test/test_archive.py b/test/test_archive.py new file mode 100644 index 000000000..6d731e6db --- /dev/null +++ b/test/test_archive.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +from time import sleep + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.archive import Archive + + +class TestArchive(unittest.TestCase): + def setUp(self): + cur_dir = os.path.dirname(os.path.abspath(__file__)) + test_archive = os.path.join(cur_dir, "test_archive.txt") + self.archive = Archive(test_archive) + + def tearDown(self): + if os.path.exists(self.archive.filepath): + os.remove(self.archive.filepath) + + def test_archive_disabled(self): + self.assertTrue(Archive(None)._disabled) + self.assertTrue(Archive("")._disabled) + self.assertFalse(Archive("/anything")._disabled) + + def test_archive_read_empty_file(self): + self.archive._read_file() + + def test_archive_exists(self): + self.archive.record_download("dl_id") + self.assertTrue("dl_id" in self.archive._data) + self.assertTrue("dl_id" in self.archive) + + def test_archive_not_exists(self): + self.assertFalse("dl_id" in self.archive) + + def test_archive_last_read_on_write(self): + t1 = self.archive._last_read + self.archive.record_download("dl_id") + t2 = self.archive._last_read + self.assertNotEqual(t1, t2) + + def test_archive_last_read_on_read(self): + t1 = self.archive._last_read + self.archive.record_download("dl_id 1") + t2 = self.archive._last_read + self.assertNotEqual(t1, t2) + + def test_archive_file_not_changed(self): + self.archive.record_download("dl_id") + self.assertFalse(self.archive._file_changed()) + + def test_archive_file_changed(self): + self.archive.record_download("dl_id 1") + with open(self.archive.filepath, "a", encoding="utf-8") as f_out: + sleep(0.01) + f_out.write("dl_id 2\n") + self.assertTrue(self.archive._file_changed()) + + def test_archive_file_changed_exists(self): + self.archive.record_download("dl_id 1") + with open(self.archive.filepath, "a", encoding="utf-8") as f_out: + sleep(0.01) + f_out.write("dl_id 2\n") + self.assertTrue(self.archive._file_changed()) + self.assertFalse("dl_id 2" in self.archive._data) + self.assertTrue("dl_id 2" in self.archive) + + def test_archive_multiple_writes(self): + self.archive.record_download("dl_id 1") + self.archive.record_download("dl_id 2") + self.archive.record_download("dl_id 3") + expected = "dl_id 1" + "\n" + "dl_id 2" + "\n" + "dl_id 3" + "\n" + with open(self.archive.filepath, "r", encoding="utf-8") as f_in: + self.assertEqual(f_in.read(), expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 19370f62b..271cbcad4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -28,6 +28,7 @@ import random from string import ascii_letters +from .archive import Archive from .compat import ( compat_basestring, compat_cookiejar, @@ -416,6 +417,8 @@ class YoutubeDL(object): 'Parameter outtmpl is bytes, but should be a unicode string. ' 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') + self.archive = Archive(self.params.get('download_archive')) + self._setup_opener() if auto_init: @@ -2094,32 +2097,15 @@ class YoutubeDL(object): return extractor.lower() + ' ' + video_id def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return False - vid_id = self._make_archive_id(info_dict) if not vid_id: return False # Incomplete video information - - try: - with locked_file(fn, 'r', encoding='utf-8') as archive_file: - for line in archive_file: - if line.strip() == vid_id: - return True - except IOError as ioe: - if ioe.errno != errno.ENOENT: - raise - return False + return vid_id in self.archive def record_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return vid_id = self._make_archive_id(info_dict) assert vid_id - with locked_file(fn, 'a', encoding='utf-8') as archive_file: - archive_file.write(vid_id + '\n') + self.archive.record_download(vid_id) @staticmethod def format_resolution(format, default='unknown'): diff --git a/youtube_dl/archive.py b/youtube_dl/archive.py new file mode 100644 index 000000000..9fb9480e4 --- /dev/null +++ b/youtube_dl/archive.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# coding: utf-8 +import errno +import os + +from youtube_dl.compat import compat_st_mtime +from youtube_dl.utils import locked_file + + +class Archive(object): + """ Class that manages the download Archive. Provides optimizations to avoid + excessive file parsing. + + Initializing options: + filepath: str The filepath of the archive. + + Properties: + data: set Container for holding a cache of the archive + disabled: bool When true, all functions are effectively void + filepath: str The filepath of the archive + + Public Methods: + __contains__ Checks a video id (archive id) exists in archive + record_download Adds a new download to archive + + Private Methods: + _file_changed Checks if file has been modified since last read + _read_archive Reads archive from disk + """ + + def __init__(self, filepath): + """ Instantiate Archive + filepath: str or None. The filepath of the archive. If None is provided + instance can still be used but it's effectively + doing nothing.""" + + self.filepath = None if filepath == "" else filepath + self._disabled = self.filepath is None + self._data = set() + self._last_read = 0 + + def record_download(self, archive_id): + """ Records a new archive_id in the archive """ + if self._disabled: + return + + with locked_file(self.filepath, "a", encoding="utf-8") as file_out: + file_out.write(archive_id + "\n") + self._last_read = compat_st_mtime(self.filepath) + self._data.add(archive_id) + + def _file_changed(self): + """ Checks if file has been modified, using system Modification Date """ + if os.path.exists(self.filepath): + return self._last_read < compat_st_mtime(self.filepath) + return True + + def _read_file(self): + """ Reads the data from archive file """ + if self._disabled or not self._file_changed(): + return + try: + with locked_file(self.filepath, "r", encoding="utf-8") as file_in: + self._data.update(line.strip() for line in file_in) + self._last_read = compat_st_mtime(self.filepath) + except IOError as err: + if err.errno != errno.ENOENT: + raise + + def __contains__(self, item): + if not isinstance(item, str): + raise ValueError( + "An archive contains only strings. Provided {}".format(type(item)) + ) + + if item not in self._data: + self._read_file() + return item in self._data diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index d1b86bd13..3172b5105 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2978,6 +2978,14 @@ else: return ctypes.WINFUNCTYPE(*args, **kwargs) +def compat_st_mtime(path): + """ Py3 has nanosecond accuracy on modification time of file """ + try: + return os.stat(path).st_mtime_ns + except ImportError: + return os.stat(path).st_mtime + + __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', @@ -3014,6 +3022,7 @@ __all__ = [ 'compat_shlex_quote', 'compat_shlex_split', 'compat_socket_create_connection', + 'compat_st_mtime', 'compat_str', 'compat_struct_pack', 'compat_struct_unpack', From f947efd3e9931589baeef74946d9da651ca41fcb Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Sat, 18 Apr 2020 00:12:41 +0200 Subject: [PATCH 2/8] Improve docstring --- youtube_dl/archive.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/archive.py b/youtube_dl/archive.py index 9fb9480e4..643e48ffb 100644 --- a/youtube_dl/archive.py +++ b/youtube_dl/archive.py @@ -15,17 +15,16 @@ class Archive(object): filepath: str The filepath of the archive. Properties: - data: set Container for holding a cache of the archive - disabled: bool When true, all functions are effectively void + _data: set Container for holding a cache of the archive + _disabled: bool When true, all functions are effectively void filepath: str The filepath of the archive + _last_read: float The last modification date of the archive file - Public Methods: + Methods: __contains__ Checks a video id (archive id) exists in archive - record_download Adds a new download to archive - - Private Methods: _file_changed Checks if file has been modified since last read _read_archive Reads archive from disk + record_download Adds a new download to archive """ def __init__(self, filepath): From e58f3a61fef490fe044f439dafa579e2af2f9ceb Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Mon, 20 Apr 2020 15:11:17 +0200 Subject: [PATCH 3/8] Remove logic checking if file has changed --- test/test_archive.py | 32 -------------------------------- youtube_dl/archive.py | 20 +++++--------------- 2 files changed, 5 insertions(+), 47 deletions(-) diff --git a/test/test_archive.py b/test/test_archive.py index 6d731e6db..932c21ba5 100644 --- a/test/test_archive.py +++ b/test/test_archive.py @@ -39,38 +39,6 @@ class TestArchive(unittest.TestCase): def test_archive_not_exists(self): self.assertFalse("dl_id" in self.archive) - def test_archive_last_read_on_write(self): - t1 = self.archive._last_read - self.archive.record_download("dl_id") - t2 = self.archive._last_read - self.assertNotEqual(t1, t2) - - def test_archive_last_read_on_read(self): - t1 = self.archive._last_read - self.archive.record_download("dl_id 1") - t2 = self.archive._last_read - self.assertNotEqual(t1, t2) - - def test_archive_file_not_changed(self): - self.archive.record_download("dl_id") - self.assertFalse(self.archive._file_changed()) - - def test_archive_file_changed(self): - self.archive.record_download("dl_id 1") - with open(self.archive.filepath, "a", encoding="utf-8") as f_out: - sleep(0.01) - f_out.write("dl_id 2\n") - self.assertTrue(self.archive._file_changed()) - - def test_archive_file_changed_exists(self): - self.archive.record_download("dl_id 1") - with open(self.archive.filepath, "a", encoding="utf-8") as f_out: - sleep(0.01) - f_out.write("dl_id 2\n") - self.assertTrue(self.archive._file_changed()) - self.assertFalse("dl_id 2" in self.archive._data) - self.assertTrue("dl_id 2" in self.archive) - def test_archive_multiple_writes(self): self.archive.record_download("dl_id 1") self.archive.record_download("dl_id 2") diff --git a/youtube_dl/archive.py b/youtube_dl/archive.py index 643e48ffb..13106277b 100644 --- a/youtube_dl/archive.py +++ b/youtube_dl/archive.py @@ -18,25 +18,22 @@ class Archive(object): _data: set Container for holding a cache of the archive _disabled: bool When true, all functions are effectively void filepath: str The filepath of the archive - _last_read: float The last modification date of the archive file Methods: __contains__ Checks a video id (archive id) exists in archive - _file_changed Checks if file has been modified since last read _read_archive Reads archive from disk record_download Adds a new download to archive """ def __init__(self, filepath): """ Instantiate Archive - filepath: str or None. The filepath of the archive. If None is provided - instance can still be used but it's effectively - doing nothing.""" + filepath: str or None. The filepath of the archive. If None or empty + string is provided instance can still be used + but it's effectively doing nothing.""" self.filepath = None if filepath == "" else filepath self._disabled = self.filepath is None self._data = set() - self._last_read = 0 def record_download(self, archive_id): """ Records a new archive_id in the archive """ @@ -45,23 +42,16 @@ class Archive(object): with locked_file(self.filepath, "a", encoding="utf-8") as file_out: file_out.write(archive_id + "\n") - self._last_read = compat_st_mtime(self.filepath) self._data.add(archive_id) - def _file_changed(self): - """ Checks if file has been modified, using system Modification Date """ - if os.path.exists(self.filepath): - return self._last_read < compat_st_mtime(self.filepath) - return True - def _read_file(self): """ Reads the data from archive file """ - if self._disabled or not self._file_changed(): + if self._disabled: return + try: with locked_file(self.filepath, "r", encoding="utf-8") as file_in: self._data.update(line.strip() for line in file_in) - self._last_read = compat_st_mtime(self.filepath) except IOError as err: if err.errno != errno.ENOENT: raise From 4a95f62d1d70491ded689b2dd2cd1438adaa0bd4 Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Thu, 23 Apr 2020 20:38:24 +0200 Subject: [PATCH 4/8] Remove unused function --- youtube_dl/archive.py | 2 -- youtube_dl/compat.py | 8 -------- 2 files changed, 10 deletions(-) diff --git a/youtube_dl/archive.py b/youtube_dl/archive.py index 13106277b..e7940b9db 100644 --- a/youtube_dl/archive.py +++ b/youtube_dl/archive.py @@ -1,9 +1,7 @@ #!/usr/bin/env python # coding: utf-8 import errno -import os -from youtube_dl.compat import compat_st_mtime from youtube_dl.utils import locked_file diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 3172b5105..2f43d987c 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2978,14 +2978,6 @@ else: return ctypes.WINFUNCTYPE(*args, **kwargs) -def compat_st_mtime(path): - """ Py3 has nanosecond accuracy on modification time of file """ - try: - return os.stat(path).st_mtime_ns - except ImportError: - return os.stat(path).st_mtime - - __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', From cf85d36c5f442b7ce54d1ae83f3632eac0671be9 Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Thu, 23 Apr 2020 20:38:56 +0200 Subject: [PATCH 5/8] Remove unused function --- youtube_dl/compat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2f43d987c..d1b86bd13 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3014,7 +3014,6 @@ __all__ = [ 'compat_shlex_quote', 'compat_shlex_split', 'compat_socket_create_connection', - 'compat_st_mtime', 'compat_str', 'compat_struct_pack', 'compat_struct_unpack', From 4194bb0306e989e6a0844cd074271b2f5b15450f Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Thu, 23 Apr 2020 20:39:23 +0200 Subject: [PATCH 6/8] Remove unused import --- test/test_archive.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_archive.py b/test/test_archive.py index 932c21ba5..6ec6fe9b4 100644 --- a/test/test_archive.py +++ b/test/test_archive.py @@ -6,7 +6,6 @@ from __future__ import unicode_literals import os import sys import unittest -from time import sleep sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) From 66a3e4551dcde47065d967b2b0639fd1f1d95f30 Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Thu, 23 Apr 2020 20:42:55 +0200 Subject: [PATCH 7/8] Code style --- youtube_dl/archive.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/archive.py b/youtube_dl/archive.py index e7940b9db..035a3c024 100644 --- a/youtube_dl/archive.py +++ b/youtube_dl/archive.py @@ -57,8 +57,7 @@ class Archive(object): def __contains__(self, item): if not isinstance(item, str): raise ValueError( - "An archive contains only strings. Provided {}".format(type(item)) - ) + "An archive contains only strings. Provided {}".format(type(item))) if item not in self._data: self._read_file() From b2d61f9698d23157f0cd22f78956098c2ab1e5f5 Mon Sep 17 00:00:00 2001 From: Sergio Kef Date: Sat, 25 Apr 2020 13:46:14 +0200 Subject: [PATCH 8/8] Refactor --- test/test_archive.py | 72 +++++++++++++++++++++-------------------- youtube_dl/YoutubeDL.py | 33 ++++++++++++++++--- youtube_dl/archive.py | 64 ------------------------------------ 3 files changed, 66 insertions(+), 103 deletions(-) delete mode 100644 youtube_dl/archive.py diff --git a/test/test_archive.py b/test/test_archive.py index 6ec6fe9b4..64aca45ad 100644 --- a/test/test_archive.py +++ b/test/test_archive.py @@ -9,42 +9,44 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.archive import Archive +# TODO refactor test to actually test YoutubeDL archive functionality - -class TestArchive(unittest.TestCase): - def setUp(self): - cur_dir = os.path.dirname(os.path.abspath(__file__)) - test_archive = os.path.join(cur_dir, "test_archive.txt") - self.archive = Archive(test_archive) - - def tearDown(self): - if os.path.exists(self.archive.filepath): - os.remove(self.archive.filepath) - - def test_archive_disabled(self): - self.assertTrue(Archive(None)._disabled) - self.assertTrue(Archive("")._disabled) - self.assertFalse(Archive("/anything")._disabled) - - def test_archive_read_empty_file(self): - self.archive._read_file() - - def test_archive_exists(self): - self.archive.record_download("dl_id") - self.assertTrue("dl_id" in self.archive._data) - self.assertTrue("dl_id" in self.archive) - - def test_archive_not_exists(self): - self.assertFalse("dl_id" in self.archive) - - def test_archive_multiple_writes(self): - self.archive.record_download("dl_id 1") - self.archive.record_download("dl_id 2") - self.archive.record_download("dl_id 3") - expected = "dl_id 1" + "\n" + "dl_id 2" + "\n" + "dl_id 3" + "\n" - with open(self.archive.filepath, "r", encoding="utf-8") as f_in: - self.assertEqual(f_in.read(), expected) +# from youtube_dl.archive import Archive +# +# +# class TestArchive(unittest.TestCase): +# def setUp(self): +# cur_dir = os.path.dirname(os.path.abspath(__file__)) +# test_archive = os.path.join(cur_dir, "test_archive.txt") +# self.archive = Archive(test_archive) +# +# def tearDown(self): +# if os.path.exists(self.archive.filepath): +# os.remove(self.archive.filepath) +# +# def test_archive_disabled(self): +# self.assertTrue(Archive(None)._disabled) +# self.assertTrue(Archive("")._disabled) +# self.assertFalse(Archive("/anything")._disabled) +# +# def test_archive_read_empty_file(self): +# self.archive._read_file() +# +# def test_archive_exists(self): +# self.archive.record_download("dl_id") +# self.assertTrue("dl_id" in self.archive._data) +# self.assertTrue("dl_id" in self.archive) +# +# def test_archive_not_exists(self): +# self.assertFalse("dl_id" in self.archive) +# +# def test_archive_multiple_writes(self): +# self.archive.record_download("dl_id 1") +# self.archive.record_download("dl_id 2") +# self.archive.record_download("dl_id 3") +# expected = "dl_id 1" + "\n" + "dl_id 2" + "\n" + "dl_id 3" + "\n" +# with open(self.archive.filepath, "r", encoding="utf-8") as f_in: +# self.assertEqual(f_in.read(), expected) if __name__ == "__main__": diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 271cbcad4..7afab0f16 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -28,7 +28,6 @@ import random from string import ascii_letters -from .archive import Archive from .compat import ( compat_basestring, compat_cookiejar, @@ -417,7 +416,8 @@ class YoutubeDL(object): 'Parameter outtmpl is bytes, but should be a unicode string. ' 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') - self.archive = Archive(self.params.get('download_archive')) + self._archive_filepath = self.params.get('download_archive') + self._archive_set = set() self._setup_opener() @@ -2097,15 +2097,40 @@ class YoutubeDL(object): return extractor.lower() + ' ' + video_id def in_download_archive(self, info_dict): + """ Checks if video from info_dict exists in archive""" + + if self._archive_filepath is None: + return False + vid_id = self._make_archive_id(info_dict) if not vid_id: return False # Incomplete video information - return vid_id in self.archive + + if vid_id in self._archive_set: + return True + + # Read the archive file + try: + with locked_file(self._archive_filepath, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + self._archive_set.add(line.strip()) + if line.strip() == vid_id: + return True + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + return False def record_download_archive(self, info_dict): + """ Records a new entry in the archive for the info_dict """ + + if self._archive_filepath is None: + return vid_id = self._make_archive_id(info_dict) assert vid_id - self.archive.record_download(vid_id) + with locked_file(self._archive_filepath, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') + self._archive_set.add(vid_id) @staticmethod def format_resolution(format, default='unknown'): diff --git a/youtube_dl/archive.py b/youtube_dl/archive.py deleted file mode 100644 index 035a3c024..000000000 --- a/youtube_dl/archive.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -import errno - -from youtube_dl.utils import locked_file - - -class Archive(object): - """ Class that manages the download Archive. Provides optimizations to avoid - excessive file parsing. - - Initializing options: - filepath: str The filepath of the archive. - - Properties: - _data: set Container for holding a cache of the archive - _disabled: bool When true, all functions are effectively void - filepath: str The filepath of the archive - - Methods: - __contains__ Checks a video id (archive id) exists in archive - _read_archive Reads archive from disk - record_download Adds a new download to archive - """ - - def __init__(self, filepath): - """ Instantiate Archive - filepath: str or None. The filepath of the archive. If None or empty - string is provided instance can still be used - but it's effectively doing nothing.""" - - self.filepath = None if filepath == "" else filepath - self._disabled = self.filepath is None - self._data = set() - - def record_download(self, archive_id): - """ Records a new archive_id in the archive """ - if self._disabled: - return - - with locked_file(self.filepath, "a", encoding="utf-8") as file_out: - file_out.write(archive_id + "\n") - self._data.add(archive_id) - - def _read_file(self): - """ Reads the data from archive file """ - if self._disabled: - return - - try: - with locked_file(self.filepath, "r", encoding="utf-8") as file_in: - self._data.update(line.strip() for line in file_in) - except IOError as err: - if err.errno != errno.ENOENT: - raise - - def __contains__(self, item): - if not isinstance(item, str): - raise ValueError( - "An archive contains only strings. Provided {}".format(type(item))) - - if item not in self._data: - self._read_file() - return item in self._data