1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2025-01-07 13:47:54 +01:00

Add Archive class for efficient archiving

Currently we reread the archive file for every new link we download.
In cases where a user runs youtube_dl to keep their library
up-to-date, this is very inefficient, as most of the links
are already downloaded.
The Archive class works halfway between a database and an append
log: it is an append log that also keeps a hash set in memory
for fast checking of existing links.
A compatibility function was added for reading a file's last
modification time.
Tests have been added.
This commit is contained in:
Sergio Kef 2020-04-18 00:03:19 +02:00
parent 00eb865b3c
commit dd065a7305
4 changed files with 176 additions and 19 deletions

84
test/test_archive.py Normal file
View File

@ -0,0 +1,84 @@
#!/usr/bin/env python
from __future__ import unicode_literals
# Allow direct execution
import os
import sys
import unittest
from time import sleep
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.archive import Archive
class TestArchive(unittest.TestCase):
    """Tests for youtube_dl.archive.Archive.

    Every test works on a scratch archive file located next to this test
    module; the file is removed again in tearDown.
    """

    def setUp(self):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        test_archive = os.path.join(cur_dir, "test_archive.txt")
        self.archive = Archive(test_archive)

    def tearDown(self):
        if os.path.exists(self.archive.filepath):
            os.remove(self.archive.filepath)

    def _append_externally(self, line):
        """Append `line` to the archive file without going through Archive.

        Simulates another process writing to the same archive.
        """
        # io.open accepts `encoding` on both Python 2 and 3; the built-in
        # open() only does on Python 3.
        import io

        with io.open(self.archive.filepath, "a", encoding="utf-8") as f_out:
            # Give the modification time a chance to move past the value
            # recorded by a preceding Archive write.
            sleep(0.01)
            f_out.write(line + "\n")

    def _read_archive_file(self):
        """Return the full text content of the archive file."""
        import io

        with io.open(self.archive.filepath, "r", encoding="utf-8") as f_in:
            return f_in.read()

    def test_archive_disabled(self):
        self.assertTrue(Archive(None)._disabled)
        self.assertTrue(Archive("")._disabled)
        self.assertFalse(Archive("/anything")._disabled)

    def test_archive_read_empty_file(self):
        # The archive file does not exist yet; reading must not raise.
        self.archive._read_file()

    def test_archive_exists(self):
        self.archive.record_download("dl_id")
        self.assertTrue("dl_id" in self.archive._data)
        self.assertTrue("dl_id" in self.archive)

    def test_archive_not_exists(self):
        self.assertFalse("dl_id" in self.archive)

    def test_archive_last_read_on_write(self):
        t1 = self.archive._last_read
        self.archive.record_download("dl_id")
        t2 = self.archive._last_read
        self.assertNotEqual(t1, t2)

    def test_archive_last_read_on_read(self):
        # The entry is written behind the archive's back, so the membership
        # test below can only succeed by reading the file from disk.
        self._append_externally("dl_id 1")
        t1 = self.archive._last_read
        self.assertTrue("dl_id 1" in self.archive)
        t2 = self.archive._last_read
        self.assertNotEqual(t1, t2)

    def test_archive_file_not_changed(self):
        self.archive.record_download("dl_id")
        self.assertFalse(self.archive._file_changed())

    def test_archive_file_changed(self):
        self.archive.record_download("dl_id 1")
        self._append_externally("dl_id 2")
        self.assertTrue(self.archive._file_changed())

    def test_archive_file_changed_exists(self):
        self.archive.record_download("dl_id 1")
        self._append_externally("dl_id 2")
        self.assertTrue(self.archive._file_changed())
        # Not cached yet ...
        self.assertFalse("dl_id 2" in self.archive._data)
        # ... but a membership test triggers a re-read of the changed file.
        self.assertTrue("dl_id 2" in self.archive)

    def test_archive_multiple_writes(self):
        self.archive.record_download("dl_id 1")
        self.archive.record_download("dl_id 2")
        self.archive.record_download("dl_id 3")
        expected = "dl_id 1" + "\n" + "dl_id 2" + "\n" + "dl_id 3" + "\n"
        self.assertEqual(self._read_archive_file(), expected)
if __name__ == "__main__":
unittest.main()

View File

@ -28,6 +28,7 @@ import random
from string import ascii_letters
from .archive import Archive
from .compat import (
compat_basestring,
compat_cookiejar,
@ -416,6 +417,8 @@ class YoutubeDL(object):
'Parameter outtmpl is bytes, but should be a unicode string. '
'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
self.archive = Archive(self.params.get('download_archive'))
self._setup_opener()
if auto_init:
@ -2094,32 +2097,15 @@ class YoutubeDL(object):
return extractor.lower() + ' ' + video_id
def in_download_archive(self, info_dict):
    """Return True if the video described by info_dict is already archived.

    The archive id is built with self._make_archive_id; when no id can be
    built the video is reported as not archived. The membership test is
    delegated to self.archive instead of rescanning the archive file here
    on every call.
    """
    vid_id = self._make_archive_id(info_dict)
    if not vid_id:
        return False  # Incomplete video information
    return vid_id in self.archive
def record_download_archive(self, info_dict):
    """Record the video described by info_dict in the download archive.

    The archive id is built with self._make_archive_id and handed to
    self.archive.record_download, which is the only place that writes the
    entry (writing the file here as well would store every id twice).
    """
    vid_id = self._make_archive_id(info_dict)
    assert vid_id
    self.archive.record_download(vid_id)
@staticmethod
def format_resolution(format, default='unknown'):

78
youtube_dl/archive.py Normal file
View File

@ -0,0 +1,78 @@
#!/usr/bin/env python
# coding: utf-8
import errno
import os
from youtube_dl.compat import compat_st_mtime
from youtube_dl.utils import locked_file
class Archive(object):
    """ Class that manages the download Archive. Provides optimizations to avoid
    excessive file parsing.

    The archive file is treated as an append log; the ids read from it or
    written to it are additionally cached in an in-memory set so membership
    tests do not have to parse the file each time.

    Initializing options:
        filepath: str or None   The filepath of the archive.

    Attributes:
        filepath: str or None   The filepath of the archive (None if disabled)
        _data: set              Cache of the archive ids seen so far
        _disabled: bool         When true, all functions are effectively void
        _last_read: number      Modification time of the file as of the last
                                read or write done by this instance (0 if none)

    Public Methods:
        __contains__    Checks a video id (archive id) exists in archive
        record_download Adds a new download to archive

    Private Methods:
        _file_changed   Checks if file has been modified since last read
        _read_file      Reads archive from disk
    """

    # Python 2 callers using unicode_literals pass `unicode` ids, which are not
    # instances of `str` there; `basestring` covers both string types.
    try:
        _string_types = (basestring,)  # noqa: F821  (Python 2 only)
    except NameError:
        _string_types = (str,)

    def __init__(self, filepath):
        """ Instantiate Archive
        filepath: str or None. The filepath of the archive. If None (or an
                  empty string) is provided the instance can still be used
                  but it's effectively doing nothing."""
        self.filepath = None if filepath == "" else filepath
        self._disabled = self.filepath is None
        self._data = set()
        self._last_read = 0

    def record_download(self, archive_id):
        """ Records a new archive_id in the archive """
        if self._disabled:
            return
        # Our own write moves the modification time forward. Pick up anything
        # appended by someone else first, otherwise stamping _last_read below
        # would hide those entries until the file changes again.
        self._read_file()
        with locked_file(self.filepath, "a", encoding="utf-8") as file_out:
            file_out.write(archive_id + "\n")
        # Stamp only after the file is closed so the write is reflected.
        self._last_read = compat_st_mtime(self.filepath)
        self._data.add(archive_id)

    def _file_changed(self):
        """ Checks if file has been modified, using system Modification Date

        Returns False for a disabled archive (there is no file to change) and
        True when the file cannot be stat'ed, e.g. because it does not exist."""
        if self._disabled:
            return False
        try:
            return self._last_read < compat_st_mtime(self.filepath)
        except (IOError, OSError):
            return True

    def _read_file(self):
        """ Reads the data from archive file

        Does nothing when the archive is disabled, the file is unchanged since
        the last read, or the file does not exist. Other I/O errors propagate."""
        if self._disabled or not self._file_changed():
            return
        try:
            # Take the timestamp before reading: if the file is appended to
            # while we read it, the next check sees a newer mtime and rereads.
            mtime = compat_st_mtime(self.filepath)
            with locked_file(self.filepath, "r", encoding="utf-8") as file_in:
                entries = set(line.strip() for line in file_in)
        except (IOError, OSError) as err:
            if err.errno != errno.ENOENT:
                raise
            return
        # Blank lines are not archive ids.
        entries.discard("")
        self._data.update(entries)
        self._last_read = mtime

    def __contains__(self, item):
        """ Checks whether archive id `item` is recorded in the archive

        The in-memory cache is consulted first; only on a miss is the file
        reread (and only if it changed). Raises ValueError for non-strings."""
        if not isinstance(item, self._string_types):
            raise ValueError(
                "An archive contains only strings. Provided {}".format(type(item))
            )
        if item not in self._data:
            self._read_file()
        return item in self._data

View File

@ -2978,6 +2978,14 @@ else:
return ctypes.WINFUNCTYPE(*args, **kwargs)
def compat_st_mtime(path):
    """ Return the last modification time of `path`.

    Py3 has nanosecond accuracy on modification time of file
    (`st_mtime_ns`, an integer); where that attribute is not available the
    float `st_mtime` (seconds) is returned instead. Values are therefore only
    comparable with other values returned by this function in the same
    interpreter. Raises OSError if `path` cannot be stat'ed. """
    stat_result = os.stat(path)
    try:
        return stat_result.st_mtime_ns
    except AttributeError:
        # A missing attribute raises AttributeError, not ImportError.
        return stat_result.st_mtime
__all__ = [
'compat_HTMLParseError',
'compat_HTMLParser',
@ -3014,6 +3022,7 @@ __all__ = [
'compat_shlex_quote',
'compat_shlex_split',
'compat_socket_create_connection',
'compat_st_mtime',
'compat_str',
'compat_struct_pack',
'compat_struct_unpack',