Add an Archive class that keeps an in-memory cache of the download archive.

The archive file is re-read only when its modification time is newer than the
last sync, instead of being scanned line by line on every lookup.

NOTE: the blob ids on the "index" lines below were not recomputed after the
latest revision of the added files.

diff --git a/test/test_archive.py b/test/test_archive.py
new file mode 100644
index 000000000..6d731e6db
--- /dev/null
+++ b/test/test_archive.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import io
+import os
+import sys
+import unittest
+from time import sleep
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.archive import Archive
+
+
+class TestArchive(unittest.TestCase):
+    def setUp(self):
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+        test_archive = os.path.join(cur_dir, "test_archive.txt")
+        self.archive = Archive(test_archive)
+
+    def tearDown(self):
+        if os.path.exists(self.archive.filepath):
+            os.remove(self.archive.filepath)
+
+    def test_archive_disabled(self):
+        self.assertTrue(Archive(None)._disabled)
+        self.assertTrue(Archive("")._disabled)
+        self.assertFalse(Archive("/anything")._disabled)
+
+    def test_archive_read_empty_file(self):
+        self.archive._read_file()
+
+    def test_archive_exists(self):
+        self.archive.record_download("dl_id")
+        self.assertTrue("dl_id" in self.archive._data)
+        self.assertTrue("dl_id" in self.archive)
+
+    def test_archive_not_exists(self):
+        self.assertFalse("dl_id" in self.archive)
+
+    def test_archive_last_read_on_write(self):
+        t1 = self.archive._last_read
+        self.archive.record_download("dl_id")
+        t2 = self.archive._last_read
+        self.assertNotEqual(t1, t2)
+
+    def test_archive_last_read_on_read(self):
+        # Write behind the archive's back so the lookup has to hit the disk
+        with io.open(self.archive.filepath, "a", encoding="utf-8") as f_out:
+            f_out.write("dl_id 1\n")
+        t1 = self.archive._last_read
+        self.assertTrue("dl_id 1" in self.archive)
+        t2 = self.archive._last_read
+        self.assertNotEqual(t1, t2)
+
+    def test_archive_file_not_changed(self):
+        self.archive.record_download("dl_id")
+        self.assertFalse(self.archive._file_changed())
+
+    def test_archive_file_changed(self):
+        self.archive.record_download("dl_id 1")
+        with io.open(self.archive.filepath, "a", encoding="utf-8") as f_out:
+            sleep(0.01)
+            f_out.write("dl_id 2\n")
+        self.assertTrue(self.archive._file_changed())
+
+    def test_archive_file_changed_exists(self):
+        self.archive.record_download("dl_id 1")
+        with io.open(self.archive.filepath, "a", encoding="utf-8") as f_out:
+            sleep(0.01)
+            f_out.write("dl_id 2\n")
+        self.assertTrue(self.archive._file_changed())
+        self.assertFalse("dl_id 2" in self.archive._data)
+        self.assertTrue("dl_id 2" in self.archive)
+
+    def test_archive_multiple_writes(self):
+        self.archive.record_download("dl_id 1")
+        self.archive.record_download("dl_id 2")
+        self.archive.record_download("dl_id 3")
+        expected = "dl_id 1" + "\n" + "dl_id 2" + "\n" + "dl_id 3" + "\n"
+        with io.open(self.archive.filepath, "r", encoding="utf-8") as f_in:
+            self.assertEqual(f_in.read(), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 19370f62b..271cbcad4 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -28,6 +28,7 @@ import random
 
 from string import ascii_letters
 
+from .archive import Archive
 from .compat import (
     compat_basestring,
     compat_cookiejar,
@@ -416,6 +417,8 @@ class YoutubeDL(object):
                 'Parameter outtmpl is bytes, but should be a unicode string. '
                 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
 
+        self.archive = Archive(self.params.get('download_archive'))
+
         self._setup_opener()
 
         if auto_init:
@@ -2094,32 +2097,15 @@ class YoutubeDL(object):
         return extractor.lower() + ' ' + video_id
 
     def in_download_archive(self, info_dict):
-        fn = self.params.get('download_archive')
-        if fn is None:
-            return False
-
         vid_id = self._make_archive_id(info_dict)
         if not vid_id:
             return False  # Incomplete video information
-
-        try:
-            with locked_file(fn, 'r', encoding='utf-8') as archive_file:
-                for line in archive_file:
-                    if line.strip() == vid_id:
-                        return True
-        except IOError as ioe:
-            if ioe.errno != errno.ENOENT:
-                raise
-        return False
+        return vid_id in self.archive
 
     def record_download_archive(self, info_dict):
-        fn = self.params.get('download_archive')
-        if fn is None:
-            return
         vid_id = self._make_archive_id(info_dict)
         assert vid_id
-        with locked_file(fn, 'a', encoding='utf-8') as archive_file:
-            archive_file.write(vid_id + '\n')
+        self.archive.record_download(vid_id)
 
     @staticmethod
     def format_resolution(format, default='unknown'):
diff --git a/youtube_dl/archive.py b/youtube_dl/archive.py
new file mode 100644
index 000000000..9fb9480e4
--- /dev/null
+++ b/youtube_dl/archive.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# coding: utf-8
+import errno
+import os
+
+from youtube_dl.compat import compat_st_mtime, compat_str
+from youtube_dl.utils import locked_file
+
+
+class Archive(object):
+    """ Class that manages the download Archive. Provides optimizations to avoid
+        excessive file parsing.
+
+        Initializing options:
+            filepath: str        The filepath of the archive.
+
+        Properties:
+            _data: set           Container for holding a cache of the archive
+            _disabled: bool      When true, all functions are effectively void
+            _last_read           Modification time of the file when last synced
+            filepath: str        The filepath of the archive
+
+        Public Methods:
+            __contains__         Checks a video id (archive id) exists in archive
+            record_download      Adds a new download to archive
+
+        Private Methods:
+            _file_changed        Checks if file has been modified since last read
+            _read_file           Reads archive from disk
+    """
+
+    def __init__(self, filepath):
+        """ Instantiate Archive
+        filepath: str or None.   The filepath of the archive. If None is provided
+                                 instance can still be used but it's effectively
+                                 doing nothing."""
+
+        self.filepath = None if filepath == "" else filepath
+        self._disabled = self.filepath is None
+        self._data = set()
+        self._last_read = 0
+
+    def record_download(self, archive_id):
+        """ Records a new archive_id in the archive """
+        if self._disabled:
+            return
+
+        with locked_file(self.filepath, "a", encoding="utf-8") as file_out:
+            file_out.write(archive_id + "\n")
+        # Stat only after the with block: the write is presumably flushed when
+        # the handle is released, which is when the modification time moves
+        self._last_read = compat_st_mtime(self.filepath)
+        self._data.add(archive_id)
+
+    def _file_changed(self):
+        """ Checks if file has been modified, using system Modification Date """
+        if os.path.exists(self.filepath):
+            return self._last_read < compat_st_mtime(self.filepath)
+        return True
+
+    def _read_file(self):
+        """ Reads the data from archive file """
+        if self._disabled or not self._file_changed():
+            return
+        try:
+            with locked_file(self.filepath, "r", encoding="utf-8") as file_in:
+                self._data.update(line.strip() for line in file_in)
+            self._last_read = compat_st_mtime(self.filepath)
+        except IOError as err:
+            if err.errno != errno.ENOENT:
+                raise
+
+    def __contains__(self, item):
+        """ Checks the cache first and falls back to re-reading the file """
+        # compat_str, not str: with unicode_literals the ids are unicode on Py2
+        if not isinstance(item, compat_str):
+            raise ValueError(
+                "An archive contains only strings. Provided {}".format(type(item))
+            )
+
+        if item not in self._data:
+            self._read_file()
+        return item in self._data
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index d1b86bd13..3172b5105 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -2978,6 +2978,14 @@ else:
         return ctypes.WINFUNCTYPE(*args, **kwargs)
 
 
+def compat_st_mtime(path):
+    """ Py3 has nanosecond accuracy on modification time of file """
+    try:
+        return os.stat(path).st_mtime_ns
+    except AttributeError:
+        return os.stat(path).st_mtime
+
+
 __all__ = [
     'compat_HTMLParseError',
     'compat_HTMLParser',
@@ -3014,6 +3022,7 @@ __all__ = [
     'compat_shlex_quote',
     'compat_shlex_split',
     'compat_socket_create_connection',
+    'compat_st_mtime',
     'compat_str',
     'compat_struct_pack',
     'compat_struct_unpack',