Add Archive class for efficient archiving

Currently we reread the archive file for every new link we download. When a user runs youtube_dl to keep their library up to date, this is very inefficient, as most of the links are already downloaded. The Archive class works halfway between a database and an append log: it is an append log that also keeps a hash set in memory for fast checks of existing links. A compatibility function was added for reading a file's last modification time. Tests have been added.
2025-01-07 13:47:54 +01:00 · 2020-04-18 00:03:19 +02:00 · 2020-04-18 00:03:19 +02:00 · dd065a7305
commit dd065a7305
parent 00eb865b3c
4 changed files with 176 additions and 19 deletions
--- a/test/test_archive.py
+++ b/test/test_archive.py
@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+from time import sleep
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.archive import Archive
+
+
+class TestArchive(unittest.TestCase):
+    """ Unit tests for youtube_dl.archive.Archive using a scratch archive file """
+
+    def setUp(self):
+        # The scratch file sits next to this module. Archive does not create
+        # it on construction, only on the first recorded download.
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+        test_archive = os.path.join(cur_dir, "test_archive.txt")
+        self.archive = Archive(test_archive)
+
+    def tearDown(self):
+        # Tests that never write leave no file behind, hence the check
+        if os.path.exists(self.archive.filepath):
+            os.remove(self.archive.filepath)
+
+    def test_archive_disabled(self):
+        self.assertTrue(Archive(None)._disabled)
+        self.assertTrue(Archive("")._disabled)
+        self.assertFalse(Archive("/anything")._disabled)
+
+    def test_archive_read_empty_file(self):
+        # Reading when no archive file exists yet must not raise
+        self.archive._read_file()
+
+    def test_archive_exists(self):
+        self.archive.record_download("dl_id")
+        self.assertTrue("dl_id" in self.archive._data)
+        self.assertTrue("dl_id" in self.archive)
+
+    def test_archive_not_exists(self):
+        self.assertFalse("dl_id" in self.archive)
+
+    def test_archive_last_read_on_write(self):
+        t1 = self.archive._last_read
+        self.archive.record_download("dl_id")
+        t2 = self.archive._last_read
+        self.assertNotEqual(t1, t2)
+
+    def test_archive_last_read_on_read(self):
+        # Append from outside the Archive instance so that the membership
+        # check below misses the cache and has to go through the read path
+        t1 = self.archive._last_read
+        with open(self.archive.filepath, "a", encoding="utf-8") as f_out:
+            f_out.write("dl_id 1\n")
+        self.assertTrue("dl_id 1" in self.archive)
+        t2 = self.archive._last_read
+        self.assertNotEqual(t1, t2)
+
+    def test_archive_file_not_changed(self):
+        self.archive.record_download("dl_id")
+        self.assertFalse(self.archive._file_changed())
+
+    def test_archive_file_changed(self):
+        self.archive.record_download("dl_id 1")
+        with open(self.archive.filepath, "a", encoding="utf-8") as f_out:
+            # Short pause so the external write gets a later modification time
+            sleep(0.01)
+            f_out.write("dl_id 2\n")
+        self.assertTrue(self.archive._file_changed())
+
+    def test_archive_file_changed_exists(self):
+        self.archive.record_download("dl_id 1")
+        with open(self.archive.filepath, "a", encoding="utf-8") as f_out:
+            # Short pause so the external write gets a later modification time
+            sleep(0.01)
+            f_out.write("dl_id 2\n")
+        self.assertTrue(self.archive._file_changed())
+        # Not cached yet; the membership test is what pulls it in from disk
+        self.assertFalse("dl_id 2" in self.archive._data)
+        self.assertTrue("dl_id 2" in self.archive)
+
+    def test_archive_multiple_writes(self):
+        self.archive.record_download("dl_id 1")
+        self.archive.record_download("dl_id 2")
+        self.archive.record_download("dl_id 3")
+        expected = "dl_id 1" + "\n" + "dl_id 2" + "\n" + "dl_id 3" + "\n"
+        with open(self.archive.filepath, "r", encoding="utf-8") as f_in:
+            self.assertEqual(f_in.read(), expected)
+
+
+# Run the test cases of this module when it is executed as a script
+if __name__ == "__main__":
+    unittest.main()
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -28,6 +28,7 @@ import random

 from string import ascii_letters

+from .archive import Archive
 from .compat import (
    compat_basestring,
    compat_cookiejar,
@ -416,6 +417,8 @@ class YoutubeDL(object):
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

+        self.archive = Archive(self.params.get('download_archive'))
+
        self._setup_opener()

        if auto_init:
@ -2094,32 +2097,15 @@ class YoutubeDL(object):
        return extractor.lower() + ' ' + video_id

    def in_download_archive(self, info_dict):
+        """ Return True if info_dict's archive id is found in self.archive """
-        fn = self.params.get('download_archive')
-        if fn is None:
-            return False
-
        vid_id = self._make_archive_id(info_dict)
        if not vid_id:
            return False  # Incomplete video information
-
-        try:
-            with locked_file(fn, 'r', encoding='utf-8') as archive_file:
-                for line in archive_file:
-                    if line.strip() == vid_id:
-                        return True
-        except IOError as ioe:
-            if ioe.errno != errno.ENOENT:
-                raise
-        return False
+        return vid_id in self.archive

    def record_download_archive(self, info_dict):
+        """ Record info_dict's archive id through self.archive.record_download """
-        fn = self.params.get('download_archive')
-        if fn is None:
-            return
        vid_id = self._make_archive_id(info_dict)
        assert vid_id
-        with locked_file(fn, 'a', encoding='utf-8') as archive_file:
-            archive_file.write(vid_id + '\n')
+        self.archive.record_download(vid_id)

    @staticmethod
    def format_resolution(format, default='unknown'):
--- a/youtube_dl/archive.py
+++ b/youtube_dl/archive.py
@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# coding: utf-8
+import errno
+import os
+
+from youtube_dl.compat import (
+    compat_basestring,
+    compat_st_mtime,
+)
+from youtube_dl.utils import locked_file
+
+
+class Archive(object):
+    """ Class that manages the download Archive. Provides optimizations to avoid
+      excessive file parsing.
+
+      Initializing options:
+        filepath: str       The filepath of the archive.
+
+      Attributes:
+        filepath: str       The filepath of the archive (None when disabled)
+        _data: set          In-memory cache of the archive ids read or written
+        _disabled: bool     When true, all functions are effectively void
+        _last_read: number  Modification time of the file at the last read or
+                            write done through this instance (0 initially)
+
+      Public Methods:
+        __contains__        Checks a video id (archive id) exists in archive
+        record_download     Adds a new download to archive
+
+      Private Methods:
+        _file_changed       Checks if file has been modified since last read
+        _read_file          Reads archive from disk
+    """
+
+    def __init__(self, filepath):
+        """ Instantiate Archive
+        filepath: str or None. The filepath of the archive. If None (or an
+                                empty string) is provided the instance can
+                                still be used but it's effectively doing
+                                nothing."""
+
+        self.filepath = None if filepath == "" else filepath
+        self._disabled = self.filepath is None
+        self._data = set()
+        self._last_read = 0
+
+    def record_download(self, archive_id):
+        """ Records a new archive_id in the archive """
+        if self._disabled:
+            return
+
+        with locked_file(self.filepath, "a", encoding="utf-8") as file_out:
+            file_out.write(archive_id + "\n")
+        # Remember the new modification time so that our own append is not
+        # mistaken for an external change on the next lookup
+        self._last_read = compat_st_mtime(self.filepath)
+        self._data.add(archive_id)
+
+    def _file_changed(self):
+        """ Checks if file has been modified, using system Modification Date
+
+        Returns False when the archive is disabled, True when the file cannot
+        be stat'ed (e.g. it does not exist yet), otherwise whether its
+        modification time is newer than the one recorded last """
+        if self._disabled:
+            # No file to look at, and a None path must not reach os.stat
+            return False
+        try:
+            return self._last_read < compat_st_mtime(self.filepath)
+        except OSError:
+            # Same answer as for a missing file; stat'ing directly avoids the
+            # gap between an existence check and the stat itself
+            return True
+
+    def _read_file(self):
+        """ Reads the data from archive file
+
+        Does nothing when the archive is disabled or the file is unchanged.
+        A missing file is ignored; any other IOError is re-raised """
+        if self._disabled or not self._file_changed():
+            return
+        try:
+            with locked_file(self.filepath, "r", encoding="utf-8") as file_in:
+                self._data.update(line.strip() for line in file_in)
+                # Taken while the file is still held open by locked_file
+                self._last_read = compat_st_mtime(self.filepath)
+        except IOError as err:
+            if err.errno != errno.ENOENT:
+                raise
+
+    def __contains__(self, item):
+        """ Checks if item (an archive id) is in the archive
+
+        The file is only consulted when item is not in the in-memory cache.
+        Raises ValueError if item is not a string """
+        # compat_basestring also accepts Python 2 unicode ids, which a plain
+        # str check would reject
+        if not isinstance(item, compat_basestring):
+            raise ValueError(
+                "An archive contains only strings. Provided {}".format(type(item))
+            )
+
+        if item not in self._data:
+            self._read_file()
+        return item in self._data
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@ -2978,6 +2978,14 @@ else:
        return ctypes.WINFUNCTYPE(*args, **kwargs)


+def compat_st_mtime(path):
+    """ Return the last modification time of path
+
+    Py3 has nanosecond accuracy on modification time of file, so the integer
+    st_mtime_ns is returned where the stat result provides it; otherwise the
+    st_mtime value is returned. The two forms use different units, so a
+    result should only be compared with other results of this function.
+    Errors raised by os.stat (e.g. for a missing path) are not caught here.
+    """
+    stat_result = os.stat(path)
+    try:
+        return stat_result.st_mtime_ns
+    except AttributeError:
+        # A stat result without st_mtime_ns raises AttributeError on access,
+        # not ImportError
+        return stat_result.st_mtime
+
+
 __all__ = [
    'compat_HTMLParseError',
    'compat_HTMLParser',
@ -3014,6 +3022,7 @@ __all__ = [
    'compat_shlex_quote',
    'compat_shlex_split',
    'compat_socket_create_connection',
+    'compat_st_mtime',
    'compat_str',
    'compat_struct_pack',
    'compat_struct_unpack',