mirror of
https://codeberg.org/polarisfm/youtube-dl
synced 2025-01-07 13:47:54 +01:00
Add Archive class for efficient archiving
Currently we reread the archive file for every new link we download. When a user runs youtube_dl to keep their library up to date, this is very inefficient, as most of the links have already been downloaded. The Archive class works halfway between a database and an append log: it is an append log that also keeps a hash set in memory for fast checks of existing links. A compatibility function was added for reading a file's last modification date. Tests have been added.
This commit is contained in:
parent
00eb865b3c
commit
dd065a7305
84
test/test_archive.py
Normal file
84
test/test_archive.py
Normal file
@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Allow direct execution
|
||||
import io
import os
import sys
import unittest
from time import sleep
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from youtube_dl.archive import Archive
|
||||
|
||||
|
||||
class TestArchive(unittest.TestCase):
    """Unit tests for youtube_dl.archive.Archive.

    Every test works on a scratch archive file located next to this module.
    The file is removed before and after each test so that tests cannot
    influence each other, even after an aborted run.
    """

    def setUp(self):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        test_archive = os.path.join(cur_dir, "test_archive.txt")
        # A previously aborted run may have left the scratch file behind;
        # start from a clean slate so its content cannot leak into this test.
        if os.path.exists(test_archive):
            os.remove(test_archive)
        self.archive = Archive(test_archive)

    def tearDown(self):
        if os.path.exists(self.archive.filepath):
            os.remove(self.archive.filepath)

    def _append_externally(self, archive_id):
        """Append archive_id to the file behind the Archive instance's back.

        Simulates another process writing to the same archive. io.open is
        used because the builtin open() has no `encoding` argument on
        Python 2.
        """
        with io.open(self.archive.filepath, "a", encoding="utf-8") as f_out:
            f_out.write(archive_id + "\n")
        # Move the modification time forward explicitly instead of sleeping:
        # this makes the change visible regardless of the timestamp
        # granularity of the filesystem.
        stat = os.stat(self.archive.filepath)
        os.utime(self.archive.filepath, (stat.st_atime, stat.st_mtime + 1))

    def test_archive_disabled(self):
        self.assertTrue(Archive(None)._disabled)
        self.assertTrue(Archive("")._disabled)
        self.assertFalse(Archive("/anything")._disabled)

    def test_archive_read_empty_file(self):
        # The archive file does not exist yet; reading must not raise
        self.archive._read_file()
        self.assertEqual(self.archive._data, set())

    def test_archive_exists(self):
        self.archive.record_download("dl_id")
        self.assertIn("dl_id", self.archive._data)
        self.assertIn("dl_id", self.archive)

    def test_archive_not_exists(self):
        self.assertNotIn("dl_id", self.archive)

    def test_archive_last_read_on_write(self):
        t1 = self.archive._last_read
        self.archive.record_download("dl_id")
        t2 = self.archive._last_read
        self.assertNotEqual(t1, t2)

    def test_archive_last_read_on_read(self):
        self.archive.record_download("dl_id 1")
        t1 = self.archive._last_read
        self._append_externally("dl_id 2")
        # Looking up an id that is not cached forces a re-read of the file
        self.assertIn("dl_id 2", self.archive)
        t2 = self.archive._last_read
        self.assertNotEqual(t1, t2)

    def test_archive_file_not_changed(self):
        self.archive.record_download("dl_id")
        self.assertFalse(self.archive._file_changed())

    def test_archive_file_changed(self):
        self.archive.record_download("dl_id 1")
        self._append_externally("dl_id 2")
        self.assertTrue(self.archive._file_changed())

    def test_archive_file_changed_exists(self):
        self.archive.record_download("dl_id 1")
        self._append_externally("dl_id 2")
        self.assertTrue(self.archive._file_changed())
        # Not cached yet ...
        self.assertNotIn("dl_id 2", self.archive._data)
        # ... but found once the membership test re-reads the file
        self.assertIn("dl_id 2", self.archive)

    def test_archive_multiple_writes(self):
        self.archive.record_download("dl_id 1")
        self.archive.record_download("dl_id 2")
        self.archive.record_download("dl_id 3")
        expected = "dl_id 1" + "\n" + "dl_id 2" + "\n" + "dl_id 3" + "\n"
        with io.open(self.archive.filepath, "r", encoding="utf-8") as f_in:
            self.assertEqual(f_in.read(), expected)
|
||||
|
||||
|
||||
# Run the test suite when this module is executed directly as a script
if __name__ == "__main__":
    unittest.main()
|
@ -28,6 +28,7 @@ import random
|
||||
|
||||
from string import ascii_letters
|
||||
|
||||
from .archive import Archive
|
||||
from .compat import (
|
||||
compat_basestring,
|
||||
compat_cookiejar,
|
||||
@ -416,6 +417,8 @@ class YoutubeDL(object):
|
||||
'Parameter outtmpl is bytes, but should be a unicode string. '
|
||||
'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
|
||||
|
||||
self.archive = Archive(self.params.get('download_archive'))
|
||||
|
||||
self._setup_opener()
|
||||
|
||||
if auto_init:
|
||||
@ -2094,32 +2097,15 @@ class YoutubeDL(object):
|
||||
return extractor.lower() + ' ' + video_id
|
||||
|
||||
def in_download_archive(self, info_dict):
|
||||
fn = self.params.get('download_archive')
|
||||
if fn is None:
|
||||
return False
|
||||
|
||||
vid_id = self._make_archive_id(info_dict)
|
||||
if not vid_id:
|
||||
return False # Incomplete video information
|
||||
|
||||
try:
|
||||
with locked_file(fn, 'r', encoding='utf-8') as archive_file:
|
||||
for line in archive_file:
|
||||
if line.strip() == vid_id:
|
||||
return True
|
||||
except IOError as ioe:
|
||||
if ioe.errno != errno.ENOENT:
|
||||
raise
|
||||
return False
|
||||
return vid_id in self.archive
|
||||
|
||||
def record_download_archive(self, info_dict):
|
||||
fn = self.params.get('download_archive')
|
||||
if fn is None:
|
||||
return
|
||||
vid_id = self._make_archive_id(info_dict)
|
||||
assert vid_id
|
||||
with locked_file(fn, 'a', encoding='utf-8') as archive_file:
|
||||
archive_file.write(vid_id + '\n')
|
||||
self.archive.record_download(vid_id)
|
||||
|
||||
@staticmethod
|
||||
def format_resolution(format, default='unknown'):
|
||||
|
78
youtube_dl/archive.py
Normal file
78
youtube_dl/archive.py
Normal file
@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
import errno
|
||||
import os
|
||||
|
||||
from youtube_dl.compat import compat_st_mtime
|
||||
from youtube_dl.utils import locked_file
|
||||
|
||||
|
||||
class Archive(object):
    """ Class that manages the download Archive. Provides optimizations to avoid
        excessive file parsing.

        The archive file is an append-only log holding one archive id per
        line. The ids seen so far are cached in a set, and the file is only
        parsed again when its modification time is newer than the one
        recorded at the last synchronization.

        Initializing options:
        filepath:           str or None  The filepath of the archive.

        Properties:
        filepath:           str     The filepath of the archive (None when
                                    the archive is disabled)
        _data:              set     Container for holding a cache of the archive
        _disabled:          bool    When true, all functions are effectively void
        _last_read:         number  Modification time of the file at the last
                                    synchronization (0 before the first one)

        Public Methods:
        __contains__        Checks a video id (archive id) exists in archive
        record_download     Adds a new download to archive

        Private Methods:
        _file_changed       Checks if file has been modified since last read
        _read_file          Reads archive from disk
    """

    # Text types accepted as archive ids. On Python 2 the ids are usually
    # `unicode` (callers use unicode_literals), which a plain `str` check
    # would reject.
    try:
        _STRING_TYPES = (basestring,)  # noqa: F821
    except NameError:  # Python 3
        _STRING_TYPES = (str,)

    def __init__(self, filepath):
        """ Instantiate Archive
            filepath: str or None. The filepath of the archive. If None (or
                      an empty string) is provided the instance can still be
                      used but it's effectively doing nothing."""

        self.filepath = None if filepath == "" else filepath
        self._disabled = self.filepath is None
        self._data = set()
        self._last_read = 0

    def record_download(self, archive_id):
        """ Records a new archive_id in the archive """
        if self._disabled:
            return

        # Synchronize the cache before the write advances _last_read.
        # Otherwise ids appended by another process since our last read
        # would look "already seen" by timestamp and never be loaded.
        # NOTE: a writer sneaking in between this read and the append below
        # can still be missed; closing that window needs a read+append under
        # one exclusive lock.
        self._read_file()

        with locked_file(self.filepath, "a", encoding="utf-8") as file_out:
            file_out.write(archive_id + "\n")
        # Stat only after the file has been closed, so the flushed write is
        # reflected in the recorded modification time.
        self._last_read = compat_st_mtime(self.filepath)
        self._data.add(archive_id)

    def _file_changed(self):
        """ Checks if file has been modified, using system Modification Date

            Returns False for a disabled archive and True when the file
            cannot be stat'ed (e.g. it does not exist yet). """
        if self._disabled:
            return False
        try:
            return self._last_read < compat_st_mtime(self.filepath)
        except (IOError, OSError):
            # Same outcome as the former os.path.exists() pre-check, but
            # without the window between the check and the stat call.
            return True

    def _read_file(self):
        """ Reads the data from archive file

            Does nothing when the archive is disabled, the file is unchanged
            since the last synchronization, or the file does not exist.
            Other I/O errors are propagated. """
        if self._disabled or not self._file_changed():
            return
        try:
            with locked_file(self.filepath, "r", encoding="utf-8") as file_in:
                # Take the timestamp before consuming the content (and,
                # presumably, while locked_file still holds its lock): a
                # later append then always looks newer than _last_read. The
                # worst case is one redundant re-read, never a lost entry.
                mtime = compat_st_mtime(self.filepath)
                entries = [line.strip() for line in file_in]
        except (IOError, OSError) as err:
            if err.errno != errno.ENOENT:
                raise
            return
        # Blank lines are not archive ids
        self._data.update(entry for entry in entries if entry)
        self._last_read = mtime

    def __contains__(self, item):
        """ Checks whether the archive id `item` is recorded in the archive

            The in-memory cache is consulted first; the file is only
            re-read on a cache miss. Raises ValueError if `item` is not a
            string. """
        if not isinstance(item, self._STRING_TYPES):
            raise ValueError(
                "An archive contains only strings. Provided {}".format(type(item))
            )

        if item not in self._data:
            self._read_file()
        return item in self._data
|
@ -2978,6 +2978,14 @@ else:
|
||||
return ctypes.WINFUNCTYPE(*args, **kwargs)
|
||||
|
||||
|
||||
def compat_st_mtime(path):
    """ Return the last modification time of the file at `path`

    Py3 has nanosecond accuracy on modification time of file: where the
    stat result provides `st_mtime_ns` (Python 3.3+), that integer number of
    nanoseconds is returned. Otherwise the float `st_mtime` (seconds) is
    returned. The two forms use different units, so only values obtained
    under the same interpreter should be compared with each other.

    Raises OSError if `path` cannot be stat'ed.
    """
    stat_result = os.stat(path)
    try:
        return stat_result.st_mtime_ns
    except AttributeError:
        # A missing attribute raises AttributeError, not ImportError
        return stat_result.st_mtime
|
||||
|
||||
|
||||
__all__ = [
|
||||
'compat_HTMLParseError',
|
||||
'compat_HTMLParser',
|
||||
@ -3014,6 +3022,7 @@ __all__ = [
|
||||
'compat_shlex_quote',
|
||||
'compat_shlex_split',
|
||||
'compat_socket_create_connection',
|
||||
'compat_st_mtime',
|
||||
'compat_str',
|
||||
'compat_struct_pack',
|
||||
'compat_struct_unpack',
|
||||
|
Loading…
Reference in New Issue
Block a user