From 684ea0385c2c649d2cc56c185734ed989d0468a6 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 10:03:25 -0700 Subject: [PATCH 01/32] Move backup to a non-associated function Give it its own loop for walking the tree, so that it can combine small files. --- src/archive.rs | 18 ------------ src/backup.rs | 67 +++++++++++++++++++++++++++++++++++++++++++ src/bin/conserve.rs | 6 ++-- src/gc_lock.rs | 6 ++-- src/lib.rs | 3 +- src/test_fixtures.rs | 4 +-- tests/gc.rs | 8 ++---- tests/integration.rs | 38 ++++++++++-------------- tests/old_archives.rs | 9 ++++-- 9 files changed, 100 insertions(+), 59 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 011b868d..3f8d0075 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -21,7 +21,6 @@ use std::sync::Mutex; use rayon::prelude::*; use serde::{Deserialize, Serialize}; -use crate::backup::BackupOptions; use crate::blockhash::BlockHash; use crate::copy_tree::CopyOptions; use crate::errors::Error; @@ -115,23 +114,6 @@ impl Archive { }) } - /// Backup a source directory into a new band in the archive. - /// - /// Returns statistics about what was copied. - pub fn backup(&self, source_path: &Path, options: &BackupOptions) -> Result { - let live_tree = LiveTree::open(source_path)?.with_excludes(options.excludes.clone()); - let writer = BackupWriter::begin(self)?; - copy_tree( - &live_tree, - writer, - &CopyOptions { - print_filenames: options.print_filenames, - measure_first: false, - ..CopyOptions::default() - }, - ) - } - /// Restore a selected version, or by default the latest, to a destination directory. pub fn restore(&self, destination_path: &Path, options: &RestoreOptions) -> Result { let st = self.open_stored_tree(options.band_selection.clone())?; diff --git a/src/backup.rs b/src/backup.rs index f88898c8..cd822ce0 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -19,8 +19,75 @@ use globset::GlobSet; use crate::blockdir::StoreFiles; use crate::index::IndexEntryIter; use crate::stats::CopyStats; +use crate::tree::ReadTree; use crate::*; +/// Backup a source directory into a new band in the archive. +/// +/// Returns statistics about what was copied. +pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> Result { + let mut writer = BackupWriter::begin(archive)?; + let mut stats = CopyStats::default(); + let mut progress_bar = ProgressBar::new(); + // This causes us to walk the source tree twice, which is probably an acceptable option + // since it's nice to see realistic overall progress. We could keep all the entries + // in memory, and maybe we should, but it might get unreasonably big. + + // if options.measure_first { + // progress_bar.set_phase("Measure source tree".to_owned()); + // // TODO: Maybe read all entries for the source tree in to memory now, rather than walking it + // // again a second time? But, that'll potentially use memory proportional to tree size, which + // // I'd like to avoid, and also perhaps make it more likely we grumble about files that were + // // deleted or changed while this is running. 
+ // progress_bar.set_bytes_total(source.size()?.file_bytes as u64); + // } + + progress_bar.set_phase("Copying".to_owned()); + let entry_iter = source.iter_entries()?; + // let entry_iter: Box> = match &options.only_subtree { + // None => Box::new(source.iter_entries()?), + // Some(subtree) => source.iter_subtree_entries(subtree)?, + // }; + for entry in entry_iter { + if options.print_filenames { + crate::ui::println(entry.apath()); + } + progress_bar.set_filename(entry.apath().to_string()); + if let Err(e) = match entry.kind() { + Kind::Dir => { + stats.directories += 1; + writer.copy_dir(&entry) + } + Kind::File => { + stats.files += 1; + let result = writer.copy_file(&entry, source).map(|s| stats += s); + if let Some(bytes) = entry.size() { + progress_bar.increment_bytes_done(bytes); + } + result + } + Kind::Symlink => { + stats.symlinks += 1; + writer.copy_symlink(&entry) + } + Kind::Unknown => { + stats.unknown_kind += 1; + // TODO: Perhaps eventually we could backup and restore pipes, + // sockets, etc. Or at least count them. For now, silently skip. + // https://github.com/sourcefrog/conserve/issues/82 + continue; + } + } { + ui::show_error(&e); + stats.errors += 1; + continue; + } + } + stats += writer.finish()?; + // TODO: Merge in stats from the tree iter and maybe the source tree? + Ok(stats) +} + /// Configuration of how to make a backup. #[derive(Debug)] pub struct BackupOptions { diff --git a/src/bin/conserve.rs b/src/bin/conserve.rs index 9dd57689..c7fed12f 100644 --- a/src/bin/conserve.rs +++ b/src/bin/conserve.rs @@ -196,11 +196,13 @@ impl Command { verbose, exclude, } => { + let excludes = excludes::from_strings(exclude)?; + let source = &LiveTree::open(source)?.with_excludes(excludes.clone()); let options = BackupOptions { print_filenames: *verbose, - excludes: excludes::from_strings(exclude)?, + excludes, }; - let copy_stats = Archive::open_path(archive)?.backup(source, &options)?; + let copy_stats = backup(&Archive::open_path(archive)?, &source, &options)?; ui::println("Backup complete."); copy_stats.summarize_backup(&mut stdout); } diff --git a/src/gc_lock.rs b/src/gc_lock.rs index e6df80bc..6a944ea4 100644 --- a/src/gc_lock.rs +++ b/src/gc_lock.rs @@ -126,9 +126,7 @@ mod test { fn completed_backup_ok() { let archive = ScratchArchive::new(); let source = TreeFixture::new(); - archive - .backup(&source.path(), &BackupOptions::default()) - .unwrap(); + backup(&archive, &source.live_tree(), &BackupOptions::default()).unwrap(); let delete_guard = GarbageCollectionLock::new(&archive).unwrap(); delete_guard.check().unwrap(); } @@ -138,7 +136,7 @@ mod test { let archive = ScratchArchive::new(); let source = TreeFixture::new(); let _delete_guard = GarbageCollectionLock::new(&archive).unwrap(); - let backup_result = archive.backup(&source.path(), &BackupOptions::default()); + let backup_result = backup(&archive, &source.live_tree(), &BackupOptions::default()); assert_eq!( backup_result.err().expect("backup fails").to_string(), "Archive is locked for garbage collection" diff --git a/src/lib.rs b/src/lib.rs index f8c14216..742104ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,8 +49,7 @@ pub mod unix_time; pub use crate::apath::Apath; pub use crate::archive::Archive; pub use crate::archive::DeleteOptions; -pub use crate::backup::BackupOptions; -pub use crate::backup::BackupWriter; +pub use crate::backup::{backup, BackupOptions}; pub use crate::band::Band; pub use crate::band::BandSelectionPolicy; pub use crate::bandid::BandId; diff --git a/src/test_fixtures.rs 
b/src/test_fixtures.rs index 8c4b73ad..db960ec0 100644 --- a/src/test_fixtures.rs +++ b/src/test_fixtures.rs @@ -65,10 +65,10 @@ impl ScratchArchive { } let options = &BackupOptions::default(); - self.archive.backup(&srcdir.path(), &options).unwrap(); + backup(&self.archive, &srcdir.live_tree(), &options).unwrap(); srcdir.create_file("hello2"); - self.archive.backup(&srcdir.path(), &options).unwrap(); + backup(&self.archive, &srcdir.live_tree(), &options).unwrap(); } } diff --git a/tests/gc.rs b/tests/gc.rs index b00d795d..4bb1f4ed 100644 --- a/tests/gc.rs +++ b/tests/gc.rs @@ -26,9 +26,7 @@ fn unreferenced_blocks() { .parse() .unwrap(); - let _copy_stats = archive - .backup(&tf.path(), &BackupOptions::default()) - .expect("backup"); + let _copy_stats = backup(&archive, &tf.live_tree(), &BackupOptions::default()).expect("backup"); // Delete the band and index std::fs::remove_dir_all(archive.path().join("b0000")).unwrap(); @@ -96,7 +94,7 @@ fn backup_prevented_by_gc_lock() -> Result<()> { let lock1 = GarbageCollectionLock::new(&archive)?; // Backup should fail while gc lock is held. - let backup_result = archive.backup(&tf.path(), &BackupOptions::default()); + let backup_result = backup(&archive, &tf.live_tree(), &BackupOptions::default()); match backup_result { Err(Error::GarbageCollectionLockHeld) => (), other => panic!("unexpected result {:?}", other), @@ -110,7 +108,7 @@ fn backup_prevented_by_gc_lock() -> Result<()> { })?; // Backup should now succeed. - let backup_result = archive.backup(&tf.path(), &BackupOptions::default()); + let backup_result = backup(&archive, &tf.live_tree(), &BackupOptions::default()); assert!(backup_result.is_ok()); Ok(()) diff --git a/tests/integration.rs b/tests/integration.rs index 9e2cc68c..cde13680 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -34,9 +34,7 @@ pub fn simple_backup() { let srcdir = TreeFixture::new(); srcdir.create_file("hello"); // TODO: Include a symlink only on Unix. - let copy_stats = af - .backup(&srcdir.path(), &BackupOptions::default()) - .expect("backup"); + let copy_stats = backup(&af, &srcdir.live_tree(), &BackupOptions::default()).expect("backup"); assert_eq!(copy_stats.index_builder_stats.index_hunks, 1); assert_eq!(copy_stats.files, 1); check_backup(&af); @@ -64,11 +62,12 @@ pub fn simple_backup_with_excludes() -> Result<()> { srcdir.create_file("baz"); // TODO: Include a symlink only on Unix. 
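// from_strings compiles the patterns below into one GlobSet; paths matching it are dropped while the source tree is walked.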
let excludes = excludes::from_strings(&["/**/baz", "/**/bar", "/**/fooo*"]).unwrap(); + let source = srcdir.live_tree().with_excludes(excludes.clone()); let options = BackupOptions { excludes, ..BackupOptions::default() }; - let copy_stats = af.backup(&srcdir.path(), &options).expect("backup"); + let copy_stats = backup(&af, &source, &options).expect("backup"); check_backup(&af); @@ -120,11 +119,12 @@ pub fn backup_more_excludes() { srcdir.create_file("bar"); let excludes = excludes::from_strings(&["/**/foo*", "/**/baz"]).unwrap(); + let source = srcdir.live_tree().with_excludes(excludes.clone()); let options = BackupOptions { excludes, print_filenames: false, }; - let stats = af.backup(&srcdir.path(), &options).expect("backup"); + let stats = backup(&af, &source, &options).expect("backup"); assert_eq!(1, stats.written_blocks); assert_eq!(1, stats.files); @@ -186,9 +186,7 @@ fn large_file() { let tf = TreeFixture::new(); let large_content = String::from("abcd").repeat(1 << 20); tf.create_file_with_contents("large", &large_content.as_bytes()); - let copy_stats = af - .backup(&tf.path(), &BackupOptions::default()) - .expect("backup"); + let copy_stats = backup(&af, &tf.live_tree(), &BackupOptions::default()).expect("backup"); assert_eq!(copy_stats.new_files, 1); // First 1MB should be new; remainder should be deduplicated. assert_eq!(copy_stats.uncompressed_bytes, 1 << 20); @@ -227,9 +225,7 @@ fn source_unreadable() { tf.make_file_unreadable("b_unreadable"); - let stats = af - .backup(&tf.path(), &BackupOptions::default()) - .expect("backup"); + let stats = backup(&af, &tf.live_tree(), &BackupOptions::default()).expect("backup"); assert_eq!(stats.errors, 1); assert_eq!(stats.new_files, 2); assert_eq!(stats.files, 3); @@ -256,7 +252,7 @@ fn mtime_before_epoch() { dbg!(&entries[1].mtime()); let af = ScratchArchive::new(); - af.backup(&tf.path(), &BackupOptions::default()) + backup(&af, &tf.live_tree(), &BackupOptions::default()) .expect("backup shouldn't crash on before-epoch mtimes"); } @@ -266,9 +262,7 @@ pub fn symlink() { let af = ScratchArchive::new(); let srcdir = TreeFixture::new(); srcdir.create_symlink("symlink", "/a/broken/destination"); - let copy_stats = af - .backup(&srcdir.path(), &BackupOptions::default()) - .expect("backup"); + let copy_stats = backup(&af, &srcdir.live_tree(), &BackupOptions::default()).expect("backup"); assert_eq!(0, copy_stats.files); assert_eq!(1, copy_stats.symlinks); @@ -297,9 +291,7 @@ pub fn empty_file_uses_zero_blocks() { let af = ScratchArchive::new(); let srcdir = TreeFixture::new(); srcdir.create_file_with_contents("empty", &[]); - let stats = af - .backup(&srcdir.path(), &BackupOptions::default()) - .unwrap(); + let stats = backup(&af, &srcdir.live_tree(), &BackupOptions::default()).unwrap(); assert_eq!(1, stats.files); assert_eq!(stats.written_blocks, 0); @@ -332,7 +324,7 @@ pub fn detect_unmodified() { srcdir.create_file("bbb"); let options = BackupOptions::default(); - let stats = af.backup(&srcdir.path(), &options).unwrap(); + let stats = backup(&af, &srcdir.live_tree(), &options).unwrap(); assert_eq!(stats.files, 2); assert_eq!(stats.new_files, 2); @@ -340,7 +332,7 @@ pub fn detect_unmodified() { // Make a second backup from the same tree, and we should see that // both files are unmodified. - let stats = af.backup(&srcdir.path(), &options).unwrap(); + let stats = backup(&af, &srcdir.live_tree(), &options).unwrap(); assert_eq!(stats.files, 2); assert_eq!(stats.new_files, 0); @@ -350,7 +342,7 @@ pub fn detect_unmodified() { // as unmodified. 
srcdir.create_file_with_contents("bbb", b"longer content for bbb"); - let stats = af.backup(&srcdir.path(), &options).unwrap(); + let stats = backup(&af, &srcdir.live_tree(), &options).unwrap(); assert_eq!(stats.files, 2); assert_eq!(stats.new_files, 0); @@ -366,7 +358,7 @@ pub fn detect_minimal_mtime_change() { srcdir.create_file_with_contents("bbb", b"longer content for bbb"); let options = BackupOptions::default(); - let stats = af.backup(&srcdir.path(), &options).unwrap(); + let stats = backup(&af, &srcdir.live_tree(), &options).unwrap(); assert_eq!(stats.files, 2); assert_eq!(stats.new_files, 2); @@ -388,7 +380,7 @@ pub fn detect_minimal_mtime_change() { } } - let stats = af.backup(&srcdir.path(), &options).unwrap(); + let stats = backup(&af, &srcdir.live_tree(), &options).unwrap(); assert_eq!(stats.files, 2); assert_eq!(stats.unmodified_files, 1); } diff --git a/tests/old_archives.rs b/tests/old_archives.rs index 13437aac..086d144a 100644 --- a/tests/old_archives.rs +++ b/tests/old_archives.rs @@ -117,9 +117,12 @@ fn restore_modify_backup() { .expect("overwrite file"); let new_archive = Archive::open_path(&new_archive_path).expect("Open new archive"); - let backup_stats = new_archive - .backup(&working_tree.path(), &BackupOptions::default()) - .expect("Backup modified tree"); + let backup_stats = backup( + &new_archive, + &LiveTree::open(working_tree.path()).unwrap(), + &BackupOptions::default(), + ) + .expect("Backup modified tree"); assert_eq!(backup_stats.files, 3); // unmodified_files should be 0, but the unchanged file is s not defected as unmodified, From d7ec2191f93d76b396b671bf01e7bc6d1e3793c8 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 10:37:14 -0700 Subject: [PATCH 02/32] Combine subtree and exclusions to ReadTree:iter_filtered --- src/copy_tree.rs | 2 +- src/live_tree.rs | 10 +++++++--- src/stored_tree.rs | 17 ++--------------- src/tree.rs | 18 +++++++++++------- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/src/copy_tree.rs b/src/copy_tree.rs index b8040536..f46e0b3c 100644 --- a/src/copy_tree.rs +++ b/src/copy_tree.rs @@ -51,7 +51,7 @@ pub fn copy_tree( progress_bar.set_phase("Copying".to_owned()); let entry_iter: Box> = match &options.only_subtree { None => Box::new(source.iter_entries()?), - Some(subtree) => source.iter_subtree_entries(subtree)?, + Some(subtree) => source.iter_filtered(subtree, &excludes::excludes_nothing())?, }; for entry in entry_iter { if options.print_filenames { diff --git a/src/live_tree.rs b/src/live_tree.rs index 7434bff2..ca336637 100644 --- a/src/live_tree.rs +++ b/src/live_tree.rs @@ -84,11 +84,15 @@ impl tree::ReadTree for LiveTree { Ok(Box::new(Iter::new(&self.path, &self.excludes)?)) } - fn iter_subtree_entries(&self, subtree: &Apath) -> Result>> { + fn iter_filtered( + &self, + subtree: &Apath, + excludes: &GlobSet, + ) -> Result>> { // TODO: Just skip directly to the requested directory, and stop when it's done. let subtree = subtree.to_owned(); Ok(Box::new( - self.iter_entries()? + Iter::new(&self.path, excludes)? 
.filter(move |entry| subtree.is_prefix_of(entry.apath())), )) } @@ -464,7 +468,7 @@ mod tests { let lt = LiveTree::open(tf.path()).unwrap(); let names: Vec = lt - .iter_subtree_entries(&"/subdir".into()) + .iter_filtered(&"/subdir".into(), &excludes::excludes_nothing()) .unwrap() .map(|entry| entry.apath.into()) .collect(); diff --git a/src/stored_tree.rs b/src/stored_tree.rs index 300db1ad..5d8a65b8 100644 --- a/src/stored_tree.rs +++ b/src/stored_tree.rs @@ -111,19 +111,6 @@ impl ReadTree for StoredTree { )) } - fn iter_subtree_entries( - &self, - subtree: &Apath, - ) -> Result>> { - // TODO: Advance in the index to the requested directory, and stop immediately when it's - // done. - let subtree = subtree.to_owned(); - Ok(Box::new( - self.iter_entries()? - .filter(move |entry| subtree.is_prefix_of(entry.apath())), - )) - } - fn file_contents(&self, entry: &Self::Entry) -> Result { Ok(self.open_stored_file(entry)?.into_read()) } @@ -177,14 +164,14 @@ mod test { } #[test] - fn iter_subtree_entries() { + fn iter_filtered() { let archive = Archive::open_path(Path::new("testdata/archive/v0.6.3/minimal-1/")).unwrap(); let st = archive .open_stored_tree(BandSelectionPolicy::Latest) .unwrap(); let names: Vec = st - .iter_subtree_entries(&"/subdir".into()) + .iter_filtered(&"/subdir".into(), &excludes::excludes_nothing()) .unwrap() .map(|entry| entry.apath.into()) .collect(); diff --git a/src/tree.rs b/src/tree.rs index c391fe99..9fa9e82b 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -21,7 +21,7 @@ use crate::*; /// Abstract Tree that may be either on the real filesystem or stored in an archive. pub trait ReadTree { // TODO: Perhaps hide these and just return dyn objects? - type Entry: Entry; + type Entry: Entry + 'static; type R: std::io::Read; /// Iterate, in apath order, all the entries in this tree. @@ -31,14 +31,18 @@ pub trait ReadTree { /// iterator. fn iter_entries(&self) -> Result>>; - /// Iterate, in apath order, the entries from a subtree. - /// - /// The provided implementation iterates and filters all entries, but implementations - /// may be able to do better. - fn iter_subtree_entries( + /// Return entries within the subtree and not excluded. + fn iter_filtered( &self, subtree: &Apath, - ) -> Result>>; + excludes: &GlobSet, + ) -> Result>> { + let subtree = subtree.to_owned(); + let excludes = excludes.clone(); + Ok(Box::new(self.iter_entries()?.filter(move |entry| { + subtree.is_prefix_of(entry.apath()) && !excludes.is_match(entry.apath()) + }))) + } /// Read file contents as a `std::io::Read`. // TODO: Remove this and use ReadBlocks or similar. From fa734dc8305e0baaad138d8d6317ab8b23b9ed36 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 15:21:23 -0700 Subject: [PATCH 03/32] Apply exclusions during iterator not in the tree To iterate a local tree's subtree, start directly there and descend, rather than walking the whole thing and throwing data away. MergeTrees works on entry iterators not trees. 
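A rough sketch of how the new pieces fit together, modeled on the `conserve diff` change below; `archive_path`, `source_path`, and the exclude pattern are illustrative, and the snippet assumes it runs in a function returning conserve::Result:

    // List both trees with the same exclusions, then merge the two sorted entry streams.
    let excludes = excludes::from_strings(&["/**/target"])?;
    let st = Archive::open_path(archive_path)?.open_stored_tree(BandSelectionPolicy::Latest)?;
    let lt = LiveTree::open(source_path)?;
    let merged: Vec<_> = MergeTrees::new(
        st.iter_filtered(None, Some(excludes.clone()))?,
        lt.iter_filtered(None, Some(excludes))?,
    )
    .collect();
    // Each merged entry records an apath and whether it was seen in the stored tree, the live tree, or both.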
--- src/archive.rs | 2 +- src/backup.rs | 6 +---- src/bin/conserve.rs | 48 +++++++++++++++++++------------------ src/copy_tree.rs | 7 +++--- src/lib.rs | 2 +- src/live_tree.rs | 56 ++++++++++++++++++-------------------------- src/merge.rs | 55 ++++++++++++++++++++++++++----------------- src/output.rs | 4 ++-- src/stored_tree.rs | 12 ++-------- src/tree.rs | 15 ++++++++---- tests/integration.rs | 4 ++-- 11 files changed, 103 insertions(+), 108 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 3f8d0075..0634821d 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -117,7 +117,6 @@ impl Archive { /// Restore a selected version, or by default the latest, to a destination directory. pub fn restore(&self, destination_path: &Path, options: &RestoreOptions) -> Result { let st = self.open_stored_tree(options.band_selection.clone())?; - let st = st.with_excludes(options.excludes.clone()); let rt = if options.overwrite { RestoreTree::create_overwrite(destination_path) } else { @@ -126,6 +125,7 @@ impl Archive { let opts = CopyOptions { print_filenames: options.print_filenames, only_subtree: options.only_subtree.clone(), + excludes: Some(options.excludes.clone()), ..CopyOptions::default() }; copy_tree(&st, rt, &opts) diff --git a/src/backup.rs b/src/backup.rs index cd822ce0..d7bfa2ac 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -43,11 +43,7 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> // } progress_bar.set_phase("Copying".to_owned()); - let entry_iter = source.iter_entries()?; - // let entry_iter: Box> = match &options.only_subtree { - // None => Box::new(source.iter_entries()?), - // Some(subtree) => source.iter_subtree_entries(subtree)?, - // }; + let entry_iter = source.iter_filtered(None, Some(options.excludes.clone()))?; for entry in entry_iter { if options.print_filenames { crate::ui::println(entry.apath()); diff --git a/src/bin/conserve.rs b/src/bin/conserve.rs index c7fed12f..ec245b4c 100644 --- a/src/bin/conserve.rs +++ b/src/bin/conserve.rs @@ -197,7 +197,7 @@ impl Command { exclude, } => { let excludes = excludes::from_strings(exclude)?; - let source = &LiveTree::open(source)?.with_excludes(excludes.clone()); + let source = &LiveTree::open(source)?; let options = BackupOptions { print_filenames: *verbose, excludes, @@ -213,7 +213,7 @@ impl Command { } } Command::Debug(Debug::Index { archive, backup }) => { - let st = stored_tree_from_opt(archive, &backup, &Vec::new())?; + let st = stored_tree_from_opt(archive, &backup)?; output::show_index_json(&st.band(), &mut stdout)?; } Command::Debug(Debug::Referenced { archive }) => { @@ -255,9 +255,15 @@ impl Command { // TODO: Summarize diff. // TODO: Optionally include unchanged files. 
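// Both trees are listed through iter_filtered with the same exclusions; MergeTrees then interleaves the two entry streams for show_tree_diff.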
let excludes = excludes::from_strings(exclude)?; - let st = stored_tree_from_opt(archive, backup, exclude)?; - let lt = LiveTree::open(source)?.with_excludes(excludes); - output::show_tree_diff(&mut conserve::iter_merged_entries(&st, <)?, &mut stdout)?; + let st = stored_tree_from_opt(archive, backup)?; + let lt = LiveTree::open(source)?; + output::show_tree_diff( + &mut MergeTrees::new( + st.iter_filtered(None, Some(excludes.clone()))?, + lt.iter_filtered(None, Some(excludes.clone()))?, + ), + &mut stdout, + )?; } Command::Gc { archive, @@ -277,14 +283,17 @@ impl Command { ui::println(&format!("Created new archive in {:?}", &archive)); } Command::Ls { stos } => { + let excludes = Some(excludes::from_strings(&stos.exclude)?); if let Some(archive) = &stos.archive { - output::show_tree_names( - &stored_tree_from_opt(archive, &stos.backup, &stos.exclude)?, + output::show_entry_names( + stored_tree_from_opt(archive, &stos.backup)? + .iter_filtered(None, excludes)?, &mut stdout, )?; } else { - output::show_tree_names( - &live_tree_from_opt(stos.source.as_ref().unwrap(), &stos.exclude)?, + output::show_entry_names( + LiveTree::open(stos.source.clone().unwrap())? + .iter_filtered(None, excludes)?, &mut stdout, )?; } @@ -314,12 +323,15 @@ impl Command { copy_stats.summarize_restore(&mut stdout)?; } Command::Size { ref stos } => { + if !stos.exclude.is_empty() { + todo!("size with exclusions not implemented") + } let size = if let Some(archive) = &stos.archive { - stored_tree_from_opt(archive, &stos.backup, &stos.exclude)? + stored_tree_from_opt(archive, &stos.backup)? .size()? .file_bytes } else { - live_tree_from_opt(stos.source.as_ref().unwrap(), &stos.exclude)? + LiveTree::open(stos.source.as_ref().unwrap())? .size()? .file_bytes }; @@ -353,16 +365,10 @@ impl Command { } } -fn stored_tree_from_opt( - archive: &Path, - backup: &Option, - exclude: &[String], -) -> Result { +fn stored_tree_from_opt(archive: &Path, backup: &Option) -> Result { let archive = Archive::open_path(archive)?; let policy = band_selection_policy_from_opt(backup); - Ok(archive - .open_stored_tree(policy)? - .with_excludes(excludes::from_strings(exclude)?)) + archive.open_stored_tree(policy) } fn band_selection_policy_from_opt(backup: &Option) -> BandSelectionPolicy { @@ -373,10 +379,6 @@ fn band_selection_policy_from_opt(backup: &Option) -> BandSelectionPolic } } -fn live_tree_from_opt(source: &Path, exclude: &[String]) -> Result { - Ok(LiveTree::open(source)?.with_excludes(excludes::from_strings(exclude)?)) -} - fn main() { ui::enable_progress(true); let result = Command::from_args().run(); diff --git a/src/copy_tree.rs b/src/copy_tree.rs index f46e0b3c..05945281 100644 --- a/src/copy_tree.rs +++ b/src/copy_tree.rs @@ -23,6 +23,7 @@ pub struct CopyOptions { pub measure_first: bool, /// Copy only this subtree from the source. pub only_subtree: Option, + pub excludes: Option, } /// Copy files and other entries from one tree to another. 
@@ -49,10 +50,8 @@ pub fn copy_tree( } progress_bar.set_phase("Copying".to_owned()); - let entry_iter: Box> = match &options.only_subtree { - None => Box::new(source.iter_entries()?), - Some(subtree) => source.iter_filtered(subtree, &excludes::excludes_nothing())?, - }; + let entry_iter: Box> = + source.iter_filtered(options.only_subtree.clone(), options.excludes.clone())?; for entry in entry_iter { if options.print_filenames { crate::ui::println(entry.apath()); diff --git a/src/lib.rs b/src/lib.rs index 742104ab..6766999d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -62,7 +62,7 @@ pub use crate::gc_lock::GarbageCollectionLock; pub use crate::index::{IndexBuilder, IndexEntry, IndexRead}; pub use crate::kind::Kind; pub use crate::live_tree::{LiveEntry, LiveTree}; -pub use crate::merge::{iter_merged_entries, MergedEntryKind}; +pub use crate::merge::{MergeTrees, MergedEntryKind}; pub use crate::misc::bytes_to_human_mb; pub use crate::progress::ProgressBar; pub use crate::restore::{RestoreOptions, RestoreTree}; diff --git a/src/live_tree.rs b/src/live_tree.rs index ca336637..425e2062 100644 --- a/src/live_tree.rs +++ b/src/live_tree.rs @@ -30,7 +30,6 @@ use crate::*; #[derive(Clone)] pub struct LiveTree { path: PathBuf, - excludes: GlobSet, } impl LiveTree { @@ -38,17 +37,9 @@ impl LiveTree { // TODO: Maybe fail here if the root doesn't exist or isn't a directory? Ok(LiveTree { path: path.as_ref().to_path_buf(), - excludes: excludes::excludes_nothing(), }) } - /// Return a new LiveTree which when listed will ignore certain files. - /// - /// This replaces any previous exclusions. - pub fn with_excludes(self, excludes: GlobSet) -> LiveTree { - LiveTree { excludes, ..self } - } - fn relative_path(&self, apath: &Apath) -> PathBuf { relative_path(&self.path, apath) } @@ -64,8 +55,8 @@ pub struct LiveEntry { symlink_target: Option, } -fn relative_path(root: &PathBuf, apath: &Apath) -> PathBuf { - let mut path = root.clone(); +fn relative_path(root: &Path, apath: &Apath) -> PathBuf { + let mut path = root.to_path_buf(); path.push(&apath[1..]); path } @@ -81,20 +72,15 @@ impl tree::ReadTree for LiveTree { /// child directories, visit them according to a sorted comparison by their UTF-8 /// name. fn iter_entries(&self) -> Result>> { - Ok(Box::new(Iter::new(&self.path, &self.excludes)?)) + Ok(Box::new(Iter::new(&self.path, None, None)?)) } fn iter_filtered( &self, - subtree: &Apath, - excludes: &GlobSet, + subtree: Option, + excludes: Option, ) -> Result>> { - // TODO: Just skip directly to the requested directory, and stop when it's done. - let subtree = subtree.to_owned(); - Ok(Box::new( - Iter::new(&self.path, excludes)? 
- .filter(move |entry| subtree.is_prefix_of(entry.apath())), - )) + Ok(Box::new(Iter::new(&self.path, subtree, excludes)?)) } fn file_contents(&self, entry: &LiveEntry) -> Result { @@ -176,7 +162,7 @@ pub struct Iter { check_order: apath::CheckOrder, /// glob pattern to skip in iterator - excludes: GlobSet, + excludes: Option, stats: LiveTreeIterStats, } @@ -184,25 +170,27 @@ pub struct Iter { impl Iter { /// Construct a new iter that will visit everything below this root path, /// subject to some exclusions - fn new(root_path: &Path, excludes: &GlobSet) -> Result { - let root_metadata = fs::symlink_metadata(&root_path).map_err(Error::from)?; + fn new(root_path: &Path, subtree: Option, excludes: Option) -> Result { + let subtree = subtree.unwrap_or("/".into()); + let start_path = relative_path(root_path, &subtree); + let start_metadata = fs::symlink_metadata(&start_path).map_err(Error::from)?; // Preload iter to return the root and then recurse into it. let mut entry_deque = VecDeque::::new(); entry_deque.push_back(LiveEntry::from_fs_metadata( - Apath::from("/"), - &root_metadata, + subtree.clone(), + &start_metadata, None, )); // TODO: Consider the case where the root is not actually a directory? // Should that be supported? let mut dir_deque = VecDeque::::new(); - dir_deque.push_back("/".into()); + dir_deque.push_back(subtree); Ok(Iter { root_path: root_path.to_path_buf(), entry_deque, dir_deque, check_order: apath::CheckOrder::new(), - excludes: excludes.clone(), + excludes, stats: LiveTreeIterStats::default(), }) } @@ -267,9 +255,11 @@ impl Iter { } }; - if self.excludes.is_match(&child_apath_str) { - self.stats.exclusions += 1; - continue; + if let Some(ref excludes) = self.excludes { + if excludes.is_match(&child_apath_str) { + self.stats.exclusions += 1; + continue; + } } let metadata = match dir_entry.metadata() { Ok(metadata) => metadata, @@ -427,8 +417,8 @@ mod tests { let excludes = excludes::from_strings(&["/**/fooo*", "/**/ba[pqr]", "/**/*bas"]).unwrap(); - let lt = LiveTree::open(tf.path()).unwrap().with_excludes(excludes); - let mut source_iter = lt.iter_entries().unwrap(); + let lt = LiveTree::open(tf.path()).unwrap(); + let mut source_iter = lt.iter_filtered(None, Some(excludes)).unwrap(); let result = source_iter.by_ref().collect::>(); // First one is the root @@ -468,7 +458,7 @@ mod tests { let lt = LiveTree::open(tf.path()).unwrap(); let names: Vec = lt - .iter_filtered(&"/subdir".into(), &excludes::excludes_nothing()) + .iter_filtered(Some("/subdir".into()), None) .unwrap() .map(|entry| entry.apath.into()) .collect(); diff --git a/src/merge.rs b/src/merge.rs index 5c0c36a7..51915b08 100644 --- a/src/merge.rs +++ b/src/merge.rs @@ -41,32 +41,41 @@ pub struct MergedEntry { /// /// Note that at present this only says whether files are absent from either /// side, not whether there is a content difference. -pub fn iter_merged_entries(a: &AT, b: &BT) -> Result> +pub struct MergeTrees where - AT: ReadTree, - BT: ReadTree, + AE: Entry, + BE: Entry, { - Ok(MergeTrees { - ait: a.iter_entries()?, - bit: b.iter_entries()?, - na: None, - nb: None, - }) -} - -pub struct MergeTrees { - ait: Box>, - bit: Box>, + ait: Box>, + bit: Box>, // Read in advance entries from A and B. 
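// `na` and `nb` buffer one entry read ahead from each side, so the merge can compare apaths and yield whichever sorts first.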
- na: Option, - nb: Option, + na: Option, + nb: Option, +} + +impl MergeTrees +where + AE: Entry, + BE: Entry, +{ + pub fn new( + ait: Box>, + bit: Box>, + ) -> MergeTrees { + MergeTrees { + ait, + bit, + na: None, + nb: None, + } + } } -impl Iterator for MergeTrees +impl Iterator for MergeTrees where - AT: ReadTree, - BT: ReadTree, + AE: Entry, + BE: Entry, { type Item = MergedEntry; @@ -142,9 +151,11 @@ mod tests { fn merge_entry_trees() { let ta = TreeFixture::new(); let tb = TreeFixture::new(); - let di = iter_merged_entries(&ta.live_tree(), &tb.live_tree()) - .unwrap() - .collect::>(); + let di = MergeTrees::new( + ta.live_tree().iter_entries().unwrap(), + tb.live_tree().iter_entries().unwrap(), + ) + .collect::>(); assert_eq!(di.len(), 1); assert_eq!( di[0], diff --git a/src/output.rs b/src/output.rs index a422ccfc..6a79853c 100644 --- a/src/output.rs +++ b/src/output.rs @@ -94,9 +94,9 @@ pub fn show_index_json(band: &Band, w: &mut dyn Write) -> Result<()> { .map_err(|source| Error::SerializeIndex { source }) } -pub fn show_tree_names(tree: &T, w: &mut dyn Write) -> Result<()> { +pub fn show_entry_names>(it: I, w: &mut dyn Write) -> Result<()> { let mut bw = BufWriter::new(w); - for entry in tree.iter_entries()? { + for entry in it { writeln!(bw, "{}", entry.apath())?; } Ok(()) diff --git a/src/stored_tree.rs b/src/stored_tree.rs index 5d8a65b8..4459da86 100644 --- a/src/stored_tree.rs +++ b/src/stored_tree.rs @@ -30,7 +30,6 @@ pub struct StoredTree { band: Band, archive: Archive, block_dir: BlockDir, - excludes: GlobSet, } impl StoredTree { @@ -38,15 +37,10 @@ impl StoredTree { Ok(StoredTree { band: Band::open(archive, band_id)?, block_dir: archive.block_dir().clone(), - excludes: excludes::excludes_nothing(), archive: archive.clone(), }) } - pub fn with_excludes(self, excludes: GlobSet) -> StoredTree { - StoredTree { excludes, ..self } - } - pub fn band(&self) -> &Band { &self.band } @@ -102,12 +96,10 @@ impl ReadTree for StoredTree { /// Return an iter of index entries in this stored tree. fn iter_entries(&self) -> Result>> { - let excludes = self.excludes.clone(); Ok(Box::new( self.archive .iter_stitched_index_hunks(self.band.id()) - .flatten() - .filter(move |entry| !excludes.is_match(&entry.apath)), + .flatten(), )) } @@ -171,7 +163,7 @@ mod test { .unwrap(); let names: Vec = st - .iter_filtered(&"/subdir".into(), &excludes::excludes_nothing()) + .iter_filtered(Some("/subdir".into()), None) .unwrap() .map(|entry| entry.apath.into()) .collect(); diff --git a/src/tree.rs b/src/tree.rs index 9fa9e82b..281be7ac 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -34,13 +34,18 @@ pub trait ReadTree { /// Return entries within the subtree and not excluded. fn iter_filtered( &self, - subtree: &Apath, - excludes: &GlobSet, + subtree: Option, + excludes: Option, ) -> Result>> { - let subtree = subtree.to_owned(); - let excludes = excludes.clone(); Ok(Box::new(self.iter_entries()?.filter(move |entry| { - subtree.is_prefix_of(entry.apath()) && !excludes.is_match(entry.apath()) + subtree + .as_ref() + .map(|s| s.is_prefix_of(entry.apath())) + .unwrap_or(true) + && excludes + .as_ref() + .map(|e| !e.is_match(entry.apath())) + .unwrap_or(true) }))) } diff --git a/tests/integration.rs b/tests/integration.rs index cde13680..41af18f6 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -62,7 +62,7 @@ pub fn simple_backup_with_excludes() -> Result<()> { srcdir.create_file("baz"); // TODO: Include a symlink only on Unix. 
let excludes = excludes::from_strings(&["/**/baz", "/**/bar", "/**/fooo*"]).unwrap(); - let source = srcdir.live_tree().with_excludes(excludes.clone()); + let source = srcdir.live_tree(); let options = BackupOptions { excludes, ..BackupOptions::default() @@ -119,7 +119,7 @@ pub fn backup_more_excludes() { srcdir.create_file("bar"); let excludes = excludes::from_strings(&["/**/foo*", "/**/baz"]).unwrap(); - let source = srcdir.live_tree().with_excludes(excludes.clone()); + let source = srcdir.live_tree(); let options = BackupOptions { excludes, print_filenames: false, From 80b18586277b15a99db011712efc0021c96efb2e Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 15:52:48 -0700 Subject: [PATCH 04/32] Support size --exclude and size --bytes --- Cargo.lock | 2 +- NEWS.md | 12 ++++++++++++ src/bin/conserve.rs | 20 +++++++++++++------- src/copy_tree.rs | 2 +- src/output.rs | 2 +- src/tree.rs | 4 ++-- tests/cli.rs | 15 +++++++++++++++ 7 files changed, 45 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a1dedc50..e0ea85b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -147,7 +147,7 @@ dependencies = [ [[package]] name = "conserve" -version = "0.6.8" +version = "0.6.9-pre" dependencies = [ "assert_cmd", "assert_fs", diff --git a/NEWS.md b/NEWS.md index 29054d57..39d83e51 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,17 @@ # Conserve release history +## v0.6.9 NOT RELEASED YET + +### Features + +- New option `conserve size --bytes` gives output in bytes, rather than + megabytes. + +### Performance + +- Reading a subtree of a local source is faster, because it no longer reads and + discards the whole tree. + ## v0.6.8 2020-10-16 ### Features diff --git a/src/bin/conserve.rs b/src/bin/conserve.rs index ec245b4c..2ff8f12a 100644 --- a/src/bin/conserve.rs +++ b/src/bin/conserve.rs @@ -121,6 +121,10 @@ enum Command { Size { #[structopt(flatten)] stos: StoredTreeOrSource, + + /// Count in bytes, not megabytes. + #[structopt(long)] + bytes: bool, }, /// Check that an archive is internally consistent. @@ -322,20 +326,22 @@ impl Command { ui::println("Restore complete."); copy_stats.summarize_restore(&mut stdout)?; } - Command::Size { ref stos } => { - if !stos.exclude.is_empty() { - todo!("size with exclusions not implemented") - } + Command::Size { ref stos, bytes } => { + let excludes = Some(excludes::from_strings(&stos.exclude)?); let size = if let Some(archive) = &stos.archive { stored_tree_from_opt(archive, &stos.backup)? - .size()? + .size(excludes)? .file_bytes } else { LiveTree::open(stos.source.as_ref().unwrap())? - .size()? + .size(excludes)? .file_bytes }; - ui::println(&conserve::bytes_to_human_mb(size)); + if *bytes { + ui::println(&format!("{}", size)); + } else { + ui::println(&conserve::bytes_to_human_mb(size)); + } } Command::Validate { archive } => { let stats = Archive::open_path(archive)?.validate()?; diff --git a/src/copy_tree.rs b/src/copy_tree.rs index 05945281..607aef06 100644 --- a/src/copy_tree.rs +++ b/src/copy_tree.rs @@ -46,7 +46,7 @@ pub fn copy_tree( // again a second time? But, that'll potentially use memory proportional to tree size, which // I'd like to avoid, and also perhaps make it more likely we grumble about files that were // deleted or changed while this is running. 
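// Passing the exclusions into size() keeps the progress total consistent with the filtered iteration used for the copy.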
- progress_bar.set_bytes_total(source.size()?.file_bytes as u64); + progress_bar.set_bytes_total(source.size(options.excludes.clone())?.file_bytes as u64); } progress_bar.set_phase("Copying".to_owned()); diff --git a/src/output.rs b/src/output.rs index 6a79853c..43477bbb 100644 --- a/src/output.rs +++ b/src/output.rs @@ -67,7 +67,7 @@ pub fn show_verbose_version_list( let tree_mb = crate::misc::bytes_to_human_mb( archive .open_stored_tree(BandSelectionPolicy::Specified(band_id.clone()))? - .size()? + .size(None)? .file_bytes, ); writeln!( diff --git a/src/tree.rs b/src/tree.rs index 281be7ac..0bb49c61 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -60,11 +60,11 @@ pub trait ReadTree { /// Measure the tree size. /// /// This typically requires walking all entries, which may take a while. - fn size(&self) -> Result { + fn size(&self, excludes: Option) -> Result { let mut progress_bar = ProgressBar::new(); progress_bar.set_phase("Measuring".to_owned()); let mut tot = 0u64; - for e in self.iter_entries()? { + for e in self.iter_filtered(None, excludes)? { // While just measuring size, ignore directories/files we can't stat. if let Some(bytes) = e.size() { tot += bytes; diff --git a/tests/cli.rs b/tests/cli.rs index 4c8f813a..d42b583f 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -469,3 +469,18 @@ fn delete_nonexistent_band() { .stdout(pred_fn) .failure(); } + +#[test] +fn size_exclude() { + let source = TreeFixture::new(); + source.create_file_with_contents("small", b"0123456789"); + source.create_file_with_contents("junk", b"01234567890123456789"); + + run_conserve() + .args(&["size", "--bytes", "--source"]) + .arg(&source.path()) + .args(&["--exclude=/junk"]) + .assert() + .success() + .stdout("10\n"); +} From ae744a1f9f14b358e65471b5ca45e1dca269d02b Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 15:55:33 -0700 Subject: [PATCH 05/32] Clean up declaration of exclusions --- src/bin/conserve.rs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/bin/conserve.rs b/src/bin/conserve.rs index 2ff8f12a..a9242266 100644 --- a/src/bin/conserve.rs +++ b/src/bin/conserve.rs @@ -99,6 +99,9 @@ enum Command { Ls { #[structopt(flatten)] stos: StoredTreeOrSource, + + #[structopt(long, short, number_of_values = 1)] + exclude: Vec, }, /// Copy a stored tree to a restore directory. @@ -125,6 +128,9 @@ enum Command { /// Count in bytes, not megabytes. #[structopt(long)] bytes: bool, + + #[structopt(long, short, number_of_values = 1)] + exclude: Vec, }, /// Check that an archive is internally consistent. @@ -156,9 +162,6 @@ struct StoredTreeOrSource { #[structopt(long, short, conflicts_with = "source")] backup: Option, - - #[structopt(long, short, number_of_values = 1)] - exclude: Vec, } /// Show debugging information. @@ -286,8 +289,8 @@ impl Command { Archive::create_path(&archive)?; ui::println(&format!("Created new archive in {:?}", &archive)); } - Command::Ls { stos } => { - let excludes = Some(excludes::from_strings(&stos.exclude)?); + Command::Ls { stos, exclude } => { + let excludes = Some(excludes::from_strings(exclude)?); if let Some(archive) = &stos.archive { output::show_entry_names( stored_tree_from_opt(archive, &stos.backup)? 
@@ -326,8 +329,12 @@ impl Command { ui::println("Restore complete."); copy_stats.summarize_restore(&mut stdout)?; } - Command::Size { ref stos, bytes } => { - let excludes = Some(excludes::from_strings(&stos.exclude)?); + Command::Size { + ref stos, + bytes, + ref exclude, + } => { + let excludes = Some(excludes::from_strings(exclude)?); let size = if let Some(archive) = &stos.archive { stored_tree_from_opt(archive, &stos.backup)? .size(excludes)? From 30c4d88b1296ad624b5186d9814f093648c42147 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 16:03:10 -0700 Subject: [PATCH 06/32] Use Option for excludes --- src/archive.rs | 2 +- src/backup.rs | 6 +++--- src/bin/conserve.rs | 8 ++++---- src/excludes.rs | 21 ++++++++++++++++----- src/index.rs | 2 +- src/live_tree.rs | 2 +- src/restore.rs | 5 ++--- 7 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 0634821d..ff336e93 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -125,7 +125,7 @@ impl Archive { let opts = CopyOptions { print_filenames: options.print_filenames, only_subtree: options.only_subtree.clone(), - excludes: Some(options.excludes.clone()), + excludes: options.excludes.clone(), ..CopyOptions::default() }; copy_tree(&st, rt, &opts) diff --git a/src/backup.rs b/src/backup.rs index d7bfa2ac..8e9dfdbc 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -43,7 +43,7 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> // } progress_bar.set_phase("Copying".to_owned()); - let entry_iter = source.iter_filtered(None, Some(options.excludes.clone()))?; + let entry_iter = source.iter_filtered(None, options.excludes.clone())?; for entry in entry_iter { if options.print_filenames { crate::ui::println(entry.apath()); @@ -91,14 +91,14 @@ pub struct BackupOptions { pub print_filenames: bool, /// Exclude these globs from the backup. - pub excludes: GlobSet, + pub excludes: Option, } impl Default for BackupOptions { fn default() -> Self { BackupOptions { print_filenames: false, - excludes: GlobSet::empty(), + excludes: None, } } } diff --git a/src/bin/conserve.rs b/src/bin/conserve.rs index a9242266..540d0220 100644 --- a/src/bin/conserve.rs +++ b/src/bin/conserve.rs @@ -266,8 +266,8 @@ impl Command { let lt = LiveTree::open(source)?; output::show_tree_diff( &mut MergeTrees::new( - st.iter_filtered(None, Some(excludes.clone()))?, - lt.iter_filtered(None, Some(excludes.clone()))?, + st.iter_filtered(None, excludes.clone())?, + lt.iter_filtered(None, excludes.clone())?, ), &mut stdout, )?; @@ -290,7 +290,7 @@ impl Command { ui::println(&format!("Created new archive in {:?}", &archive)); } Command::Ls { stos, exclude } => { - let excludes = Some(excludes::from_strings(exclude)?); + let excludes = excludes::from_strings(exclude)?; if let Some(archive) = &stos.archive { output::show_entry_names( stored_tree_from_opt(archive, &stos.backup)? @@ -334,7 +334,7 @@ impl Command { bytes, ref exclude, } => { - let excludes = Some(excludes::from_strings(exclude)?); + let excludes = excludes::from_strings(exclude)?; let size = if let Some(archive) = &stos.archive { stored_tree_from_opt(archive, &stos.backup)? .size(excludes)? 
diff --git a/src/excludes.rs b/src/excludes.rs index c9e38202..e5a87c4c 100644 --- a/src/excludes.rs +++ b/src/excludes.rs @@ -17,12 +17,19 @@ use globset::{Glob, GlobSet, GlobSetBuilder}; use super::*; -pub fn from_strings, S: AsRef>(excludes: I) -> Result { +pub fn from_strings, S: AsRef>( + excludes: I, +) -> Result> { let mut builder = GlobSetBuilder::new(); + let mut empty = true; for i in excludes { builder.add(Glob::new(i.as_ref()).map_err(|source| Error::ParseGlob { source })?); + empty = false; } - builder.build().map_err(Into::into) + if empty { + return Ok(None); + } + builder.build().map_err(Into::into).map(|gs| Some(gs)) } pub fn excludes_nothing() -> GlobSet { @@ -36,7 +43,7 @@ mod tests { #[test] pub fn simple_parse() { let vec = vec!["fo*", "foo", "bar*"]; - let excludes = excludes::from_strings(&vec).unwrap(); + let excludes = excludes::from_strings(&vec).expect("ok").expect("some"); assert_eq!(excludes.matches("foo").len(), 2); assert_eq!(excludes.matches("foobar").len(), 1); assert_eq!(excludes.matches("barBaz").len(), 1); @@ -45,13 +52,17 @@ mod tests { #[test] pub fn path_parse() { - let excludes = excludes::from_strings(&["fo*/bar/baz*"]).unwrap(); + let excludes = excludes::from_strings(&["fo*/bar/baz*"]) + .expect("ok") + .expect("some"); assert_eq!(excludes.matches("foo/bar/baz.rs").len(), 1); } #[test] pub fn extendend_pattern_parse() { - let excludes = excludes::from_strings(&["fo?", "ba[abc]", "[!a-z]"]).unwrap(); + let excludes = excludes::from_strings(&["fo?", "ba[abc]", "[!a-z]"]) + .expect("ok") + .expect("some"); assert_eq!(excludes.matches("foo").len(), 1); assert_eq!(excludes.matches("fo").len(), 0); assert_eq!(excludes.matches("baa").len(), 1); diff --git a/src/index.rs b/src/index.rs index cb18e416..bdf468e4 100644 --- a/src/index.rs +++ b/src/index.rs @@ -739,7 +739,7 @@ mod tests { let it = IndexRead::open_path(&testdir.path()) .iter_entries() .unwrap() - .with_excludes(excludes); + .with_excludes(excludes.unwrap()); let names: Vec = it.map(|x| x.apath.into()).collect(); assert_eq!(names, &["/bar"]); diff --git a/src/live_tree.rs b/src/live_tree.rs index 425e2062..822f82ee 100644 --- a/src/live_tree.rs +++ b/src/live_tree.rs @@ -418,7 +418,7 @@ mod tests { let excludes = excludes::from_strings(&["/**/fooo*", "/**/ba[pqr]", "/**/*bas"]).unwrap(); let lt = LiveTree::open(tf.path()).unwrap(); - let mut source_iter = lt.iter_filtered(None, Some(excludes)).unwrap(); + let mut source_iter = lt.iter_filtered(None, excludes).unwrap(); let result = source_iter.by_ref().collect::>(); // First one is the root diff --git a/src/restore.rs b/src/restore.rs index a7c1c60c..3a904e5f 100644 --- a/src/restore.rs +++ b/src/restore.rs @@ -22,7 +22,6 @@ use globset::GlobSet; use crate::band::BandSelectionPolicy; use crate::entry::Entry; -use crate::excludes; use crate::io::{directory_is_empty, ensure_dir_exists}; use crate::stats::CopyStats; use crate::*; @@ -31,7 +30,7 @@ use crate::*; #[derive(Debug)] pub struct RestoreOptions { pub print_filenames: bool, - pub excludes: GlobSet, + pub excludes: Option, /// Restore only this subdirectory. 
pub only_subtree: Option, pub overwrite: bool, @@ -45,7 +44,7 @@ impl Default for RestoreOptions { print_filenames: false, overwrite: false, band_selection: BandSelectionPolicy::LatestClosed, - excludes: excludes::excludes_nothing(), + excludes: None, only_subtree: None, } } From 0aca76d67b2ba3d312bdd012728f53058f982c5c Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 16:06:54 -0700 Subject: [PATCH 07/32] Don't need exclusions at the index level --- src/index.rs | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/src/index.rs b/src/index.rs index bdf468e4..7872e1dc 100644 --- a/src/index.rs +++ b/src/index.rs @@ -19,8 +19,6 @@ use std::iter::Peekable; use std::path::Path; use std::vec; -use globset::GlobSet; - use crate::compress::snappy::{Compressor, Decompressor}; use crate::kind::Kind; use crate::stats::{IndexBuilderStats, IndexReadStats}; @@ -276,7 +274,6 @@ impl IndexRead { pub fn iter_entries(&self) -> Result { Ok(IndexEntryIter { buffered_entries: Vec::::new().into_iter().peekable(), - excludes: excludes::excludes_nothing(), hunk_iter: self.iter_hunks(), }) } @@ -398,7 +395,6 @@ pub struct IndexEntryIter { /// Temporarily buffered entries, read from the index files but not yet /// returned to the client. buffered_entries: Peekable>, - excludes: GlobSet, hunk_iter: IndexHunkIter, } @@ -407,10 +403,8 @@ impl Iterator for IndexEntryIter { fn next(&mut self) -> Option { loop { - while let Some(entry) = self.buffered_entries.next() { - if !self.excludes.is_match(&entry.apath) { - return Some(entry); - } + if let Some(entry) = self.buffered_entries.next() { + return Some(entry); } if !self.refill_entry_buffer_or_warn() { return None; @@ -420,11 +414,6 @@ impl Iterator for IndexEntryIter { } impl IndexEntryIter { - /// Consume this iterator and return a new one with exclusions. - pub fn with_excludes(self, excludes: globset::GlobSet) -> IndexEntryIter { - IndexEntryIter { excludes, ..self } - } - /// Return the entry for given apath, if it is present, otherwise None. /// It follows this will also return None at the end of the index. /// @@ -727,24 +716,6 @@ mod tests { add_an_entry(&mut ib, "hello"); } - #[test] - fn excluded_entries() { - let (testdir, mut ib) = scratch_indexbuilder(); - add_an_entry(&mut ib, "/bar"); - add_an_entry(&mut ib, "/foo"); - add_an_entry(&mut ib, "/foobar"); - ib.finish_hunk().unwrap(); - - let excludes = excludes::from_strings(&["/fo*"]).unwrap(); - let it = IndexRead::open_path(&testdir.path()) - .iter_entries() - .unwrap() - .with_excludes(excludes.unwrap()); - - let names: Vec = it.map(|x| x.apath.into()).collect(); - assert_eq!(names, &["/bar"]); - } - #[test] fn advance() { let (testdir, mut ib) = scratch_indexbuilder(); From b0ee766534d1598613a52f2fb05986c4ed146ff7 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 16:21:10 -0700 Subject: [PATCH 08/32] Simplify backup code No longer uses WriteTree trait --- src/backup.rs | 57 +++++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 8e9dfdbc..8cc33a5d 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -22,6 +22,16 @@ use crate::stats::CopyStats; use crate::tree::ReadTree; use crate::*; +/// Configuration of how to make a backup. +#[derive(Debug, Default)] +pub struct BackupOptions { + /// Print filenames to the UI as they're copied. + pub print_filenames: bool, + + /// Exclude these globs from the backup. 
+ pub excludes: Option, +} + /// Backup a source directory into a new band in the archive. /// /// Returns statistics about what was copied. @@ -56,11 +66,14 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> } Kind::File => { stats.files += 1; - let result = writer.copy_file(&entry, source).map(|s| stats += s); + let result = writer.copy_file(&entry, source); + if let Ok(file_stats) = &result { + stats += file_stats.clone() + } if let Some(bytes) = entry.size() { progress_bar.increment_bytes_done(bytes); } - result + result.and(Ok(())) } Kind::Symlink => { stats.symlinks += 1; @@ -84,25 +97,6 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> Ok(stats) } -/// Configuration of how to make a backup. -#[derive(Debug)] -pub struct BackupOptions { - /// Print filenames to the UI as they're copied. - pub print_filenames: bool, - - /// Exclude these globs from the backup. - pub excludes: Option, -} - -impl Default for BackupOptions { - fn default() -> Self { - BackupOptions { - print_filenames: false, - excludes: None, - } - } -} - /// Accepts files to write in the archive (in apath order.) pub struct BackupWriter { band: Band, @@ -137,17 +131,6 @@ impl BackupWriter { }) } - /// Push a new entry into the backup's IndexBuilder. - /// - /// This is public only to facilitate testing. - pub(crate) fn push_entry(&mut self, index_entry: IndexEntry) -> Result<()> { - // TODO: Return or accumulate index sizes. - self.index_builder.push_entry(index_entry)?; - Ok(()) - } -} - -impl tree::WriteTree for BackupWriter { fn finish(self) -> Result { let index_builder_stats = self.index_builder.finish()?; self.band.close(index_builder_stats.index_hunks)?; @@ -159,7 +142,8 @@ impl tree::WriteTree for BackupWriter { fn copy_dir(&mut self, source_entry: &E) -> Result<()> { // TODO: Pass back index sizes - self.push_entry(IndexEntry::metadata_from(source_entry)) + self.index_builder + .push_entry(IndexEntry::metadata_from(source_entry)) } /// Copy in the contents of a file from another tree. @@ -186,7 +170,7 @@ impl tree::WriteTree for BackupWriter { // with the archive invariants, which include that all the // blocks referenced by the index, are actually present. stats.unmodified_files += 1; - self.push_entry(basis_entry)?; + self.index_builder.push_entry(basis_entry)?; return Ok(stats); } else { stats.modified_files += 1; @@ -199,7 +183,7 @@ impl tree::WriteTree for BackupWriter { // then downcast it to Read. 
let (addrs, file_stats) = self.store_files.store_file_content(&apath, content)?; stats += file_stats; - self.push_entry(IndexEntry { + self.index_builder.push_entry(IndexEntry { addrs, ..IndexEntry::metadata_from(source_entry) })?; @@ -209,6 +193,7 @@ impl tree::WriteTree for BackupWriter { fn copy_symlink(&mut self, source_entry: &E) -> Result<()> { let target = source_entry.symlink_target().clone(); assert!(target.is_some()); - self.push_entry(IndexEntry::metadata_from(source_entry)) + self.index_builder + .push_entry(IndexEntry::metadata_from(source_entry)) } } From 787395c6730abee0a3576486849cae586205851a Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 11 Oct 2020 16:32:56 -0700 Subject: [PATCH 09/32] Remove old comment --- src/backup.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 8cc33a5d..2beb775e 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -178,10 +178,10 @@ impl BackupWriter { } else { stats.new_files += 1; } - let content = &mut from_tree.file_contents(&source_entry)?; - // TODO: Don't read the whole file into memory, but especially don't do that and - // then downcast it to Read. - let (addrs, file_stats) = self.store_files.store_file_content(&apath, content)?; + let read_source = from_tree.file_contents(&source_entry); + let (addrs, file_stats) = self + .store_files + .store_file_content(&apath, &mut read_source?)?; stats += file_stats; self.index_builder.push_entry(IndexEntry { addrs, From e075bc00cf24d2ffd34e6ca848f673bc44f8d4f5 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Mon, 12 Oct 2020 07:13:13 -0700 Subject: [PATCH 10/32] Add todo --- src/backup.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backup.rs b/src/backup.rs index 2beb775e..654ef474 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -116,6 +116,8 @@ impl BackupWriter { if gc_lock::GarbageCollectionLock::is_locked(archive)? { return Err(Error::GarbageCollectionLockHeld); } + // TODO: Use a stitched band as the basis. + // let basis_index = archive .last_complete_band()? .map(|b| b.iter_entries()) From 265079358c3c66267e8d14e4c341d8484dba30ca Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 17 Oct 2020 09:08:56 -0700 Subject: [PATCH 11/32] Refactor backup towards processing group-at-a-time This changes the backup file-count stats to include files that later failed to be stored. --- Cargo.lock | 10 +++++ Cargo.toml | 1 + src/backup.rs | 94 ++++++++++++++++++++++---------------------- tests/integration.rs | 2 +- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e0ea85b8..2bc87e5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -159,6 +159,7 @@ dependencies = [ "escargot", "globset", "hex", + "itertools", "lazy_static", "libc", "predicates", @@ -396,6 +397,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "itertools" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.6" diff --git a/Cargo.toml b/Cargo.toml index f60593ca..8a88c27c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ crossterm = "0.17.5" derive_more = "0.99.7" globset = "0.4.5" hex = "0.4.2" +itertools = "0.9.0" lazy_static = "1.4.0" rayon = "1.3.0" regex = "1.3.9" diff --git a/src/backup.rs b/src/backup.rs index 654ef474..a093755f 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -15,6 +15,7 @@ //! 
into an archive. use globset::GlobSet; +use itertools::Itertools; use crate::blockdir::StoreFiles; use crate::index::IndexEntryIter; @@ -54,43 +55,26 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> progress_bar.set_phase("Copying".to_owned()); let entry_iter = source.iter_filtered(None, options.excludes.clone())?; - for entry in entry_iter { - if options.print_filenames { - crate::ui::println(entry.apath()); - } - progress_bar.set_filename(entry.apath().to_string()); - if let Err(e) = match entry.kind() { - Kind::Dir => { - stats.directories += 1; - writer.copy_dir(&entry) - } - Kind::File => { - stats.files += 1; - let result = writer.copy_file(&entry, source); - if let Ok(file_stats) = &result { - stats += file_stats.clone() - } - if let Some(bytes) = entry.size() { - progress_bar.increment_bytes_done(bytes); - } - result.and(Ok(())) - } - Kind::Symlink => { - stats.symlinks += 1; - writer.copy_symlink(&entry) + for entry_group in entry_iter + .chunks(crate::index::MAX_ENTRIES_PER_HUNK) + .into_iter() + { + for entry in entry_group { + if options.print_filenames { + crate::ui::println(entry.apath()); } - Kind::Unknown => { - stats.unknown_kind += 1; - // TODO: Perhaps eventually we could backup and restore pipes, - // sockets, etc. Or at least count them. For now, silently skip. - // https://github.com/sourcefrog/conserve/issues/82 + progress_bar.set_filename(entry.apath().to_string()); + if let Err(e) = writer.copy_entry(&entry, source) { + ui::show_error(&e); + stats.errors += 1; continue; } - } { - ui::show_error(&e); - stats.errors += 1; - continue; + if let Some(bytes) = entry.size() { + progress_bar.increment_bytes_done(bytes); + } } + // TODO: Finish any compression groups. + // TODO: Sort and then write out the index hunk for this group. } stats += writer.finish()?; // TODO: Merge in stats from the tree iter and maybe the source tree? @@ -98,10 +82,11 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> } /// Accepts files to write in the archive (in apath order.) -pub struct BackupWriter { +struct BackupWriter { band: Band, index_builder: IndexBuilder, store_files: StoreFiles, + stats: CopyStats, /// The index for the last stored band, used as hints for whether newly /// stored files have changed. @@ -129,6 +114,7 @@ impl BackupWriter { band, index_builder, store_files: StoreFiles::new(archive.block_dir().clone()), + stats: CopyStats::default(), basis_index, }) } @@ -138,23 +124,35 @@ impl BackupWriter { self.band.close(index_builder_stats.index_hunks)?; Ok(CopyStats { index_builder_stats, - ..CopyStats::default() + ..self.stats }) } + fn copy_entry(&mut self, entry: &R::Entry, source: &R) -> Result<()> { + match entry.kind() { + Kind::Dir => self.copy_dir(entry), + Kind::File => self.copy_file(entry, source), + Kind::Symlink => self.copy_symlink(entry), + Kind::Unknown => { + self.stats.unknown_kind += 1; + // TODO: Perhaps eventually we could backup and restore pipes, + // sockets, etc. Or at least count them. For now, silently skip. + // https://github.com/sourcefrog/conserve/issues/82 + Ok(()) + } + } + } + fn copy_dir(&mut self, source_entry: &E) -> Result<()> { // TODO: Pass back index sizes + self.stats.directories += 1; self.index_builder .push_entry(IndexEntry::metadata_from(source_entry)) } /// Copy in the contents of a file from another tree. 
- fn copy_file( - &mut self, - source_entry: &R::Entry, - from_tree: &R, - ) -> Result { - let mut stats = CopyStats::default(); + fn copy_file(&mut self, source_entry: &R::Entry, from_tree: &R) -> Result<()> { + self.stats.files += 1; let apath = source_entry.apath(); if let Some(basis_entry) = self .basis_index @@ -171,29 +169,29 @@ impl BackupWriter { // We can reasonably assume that the existing archive complies // with the archive invariants, which include that all the // blocks referenced by the index, are actually present. - stats.unmodified_files += 1; + self.stats.unmodified_files += 1; self.index_builder.push_entry(basis_entry)?; - return Ok(stats); + return Ok(()); } else { - stats.modified_files += 1; + self.stats.modified_files += 1; } } else { - stats.new_files += 1; + self.stats.new_files += 1; } let read_source = from_tree.file_contents(&source_entry); let (addrs, file_stats) = self .store_files .store_file_content(&apath, &mut read_source?)?; - stats += file_stats; + self.stats += file_stats; self.index_builder.push_entry(IndexEntry { addrs, ..IndexEntry::metadata_from(source_entry) - })?; - Ok(stats) + }) } fn copy_symlink(&mut self, source_entry: &E) -> Result<()> { let target = source_entry.symlink_target().clone(); + self.stats.symlinks += 1; assert!(target.is_some()); self.index_builder .push_entry(IndexEntry::metadata_from(source_entry)) diff --git a/tests/integration.rs b/tests/integration.rs index 41af18f6..da2b7fe2 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -227,7 +227,7 @@ fn source_unreadable() { let stats = backup(&af, &tf.live_tree(), &BackupOptions::default()).expect("backup"); assert_eq!(stats.errors, 1); - assert_eq!(stats.new_files, 2); + assert_eq!(stats.new_files, 3); assert_eq!(stats.files, 3); // TODO: On Windows change the ACL to make the file unreadable to the current user or to From b2af71eae534ebbc97cb9f14af74efe9017cb688 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 18 Oct 2020 07:43:41 -0700 Subject: [PATCH 12/32] Move StoreFiles to backup.rs --- src/backup.rs | 303 +++++++++++++++++++++++++++++++++++++++++++++- src/blockdir.rs | 310 +----------------------------------------------- 2 files changed, 308 insertions(+), 305 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index a093755f..5e3d632c 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -14,10 +14,12 @@ //! Make a backup by walking a source directory and copying the contents //! into an archive. +use std::io::prelude::*; + use globset::GlobSet; use itertools::Itertools; -use crate::blockdir::StoreFiles; +use crate::blockdir::Address; use crate::index::IndexEntryIter; use crate::stats::CopyStats; use crate::tree::ReadTree; @@ -197,3 +199,302 @@ impl BackupWriter { .push_entry(IndexEntry::metadata_from(source_entry)) } } + +/// Manages storage into the BlockDir of any number of files. +/// +/// At present this just holds a reusable input buffer. +/// +/// In future it will combine small files into aggregate blocks, +/// and perhaps compress them in parallel. +pub(crate) struct StoreFiles { + // TODO: Rename to FileWriter or similar? Perhaps doesn't need to be + // separate from BackupWriter. + block_dir: BlockDir, + input_buf: Vec, +} + +impl StoreFiles { + pub(crate) fn new(block_dir: BlockDir) -> StoreFiles { + StoreFiles { + block_dir, + input_buf: vec![0; MAX_BLOCK_SIZE], + } + } + + pub(crate) fn store_file_content( + &mut self, + apath: &Apath, + from_file: &mut dyn Read, + ) -> Result<(Vec
<Address>, CopyStats)> { + let mut addresses = Vec::<Address>
::with_capacity(1); + let mut stats = CopyStats::default(); + loop { + // TODO: Possibly read repeatedly in case we get a short read and have room for more, + // so that short reads don't lead to short blocks being stored. + // TODO: Error should actually be an error about the source file? + // TODO: This shouldn't directly read from the source, it should take blocks in. + let read_len = + from_file + .read(&mut self.input_buf) + .map_err(|source| Error::StoreFile { + apath: apath.to_owned(), + source, + })?; + if read_len == 0 { + break; + } + let block_data = &self.input_buf[..read_len]; + let hash = self.block_dir.hash_bytes(block_data)?; + if self.block_dir.contains(&hash)? { + // TODO: Separate counter for size of the already-present blocks? + stats.deduplicated_blocks += 1; + stats.deduplicated_bytes += read_len as u64; + } else { + let comp_len = self.block_dir.compress_and_store(block_data, &hash)?; + stats.written_blocks += 1; + stats.uncompressed_bytes += read_len as u64; + stats.compressed_bytes += comp_len; + } + addresses.push(Address { + hash, + start: 0, + len: read_len as u64, + }); + } + match addresses.len() { + 0 => stats.empty_files += 1, + 1 => stats.single_block_files += 1, + _ => stats.multi_block_files += 1, + } + Ok((addresses, stats)) + } +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::io::prelude::*; + use std::io::{Cursor, SeekFrom}; + + use tempfile::{NamedTempFile, TempDir}; + + use crate::stats::Sizes; + + use super::*; + + const EXAMPLE_TEXT: &[u8] = b"hello!"; + const EXAMPLE_BLOCK_HASH: &str = "66ad1939a9289aa9f1f1d9ad7bcee694293c7623affb5979bd\ + 3f844ab4adcf2145b117b7811b3cee31e130efd760e9685f208c2b2fb1d67e28262168013ba63c"; + + fn make_example_file() -> NamedTempFile { + let mut tf = NamedTempFile::new().unwrap(); + tf.write_all(EXAMPLE_TEXT).unwrap(); + tf.flush().unwrap(); + tf.seek(SeekFrom::Start(0)).unwrap(); + tf + } + + fn setup() -> (TempDir, BlockDir) { + let testdir = TempDir::new().unwrap(); + let block_dir = BlockDir::create_path(testdir.path()).unwrap(); + (testdir, block_dir) + } + + #[test] + pub fn store_a_file() { + let expected_hash = EXAMPLE_BLOCK_HASH.to_string().parse().unwrap(); + let (testdir, block_dir) = setup(); + let mut example_file = make_example_file(); + + assert_eq!(block_dir.contains(&expected_hash).unwrap(), false); + let mut store = StoreFiles::new(block_dir.clone()); + + let (addrs, stats) = store + .store_file_content(&Apath::from("/hello"), &mut example_file) + .unwrap(); + + // Should be in one block, with the expected hash. + assert_eq!(1, addrs.len()); + assert_eq!(0, addrs[0].start); + assert_eq!(addrs[0].hash, expected_hash); + + // Block should be the one block present in the list. + let present_blocks = block_dir.block_names().unwrap().collect::>(); + assert_eq!(present_blocks.len(), 1); + assert_eq!(present_blocks[0], expected_hash); + + // Subdirectory and file should exist + let expected_file = testdir.path().join("66a").join(EXAMPLE_BLOCK_HASH); + let attr = fs::metadata(expected_file).unwrap(); + assert!(attr.is_file()); + + // Compressed size is as expected. + assert_eq!(block_dir.compressed_size(&expected_hash).unwrap(), 8); + + assert_eq!(block_dir.contains(&expected_hash).unwrap(), true); + + assert_eq!(stats.deduplicated_blocks, 0); + assert_eq!(stats.written_blocks, 1); + assert_eq!(stats.uncompressed_bytes, 6); + assert_eq!(stats.compressed_bytes, 8); + + // Will vary depending on compressor and we don't want to be too brittle. 
+ assert!(stats.compressed_bytes <= 19, stats.compressed_bytes); + + // Try to read back + let (back, sizes) = block_dir.get(&addrs[0]).unwrap(); + assert_eq!(back, EXAMPLE_TEXT); + assert_eq!( + sizes, + Sizes { + uncompressed: EXAMPLE_TEXT.len() as u64, + compressed: 8u64, + } + ); + + let mut stats = ValidateStats::default(); + block_dir.validate(&mut stats).unwrap(); + assert_eq!(stats.io_errors, 0); + assert_eq!(stats.block_error_count, 0); + assert_eq!(stats.block_read_count, 1); + } + + #[test] + fn retrieve_partial_data() { + let (_testdir, block_dir) = setup(); + let mut store_files = StoreFiles::new(block_dir.clone()); + let (addrs, _stats) = store_files + .store_file_content(&"/hello".into(), &mut Cursor::new(b"0123456789abcdef")) + .unwrap(); + assert_eq!(addrs.len(), 1); + let hash = addrs[0].hash.clone(); + let first_half = Address { + start: 0, + len: 8, + hash, + }; + let (first_half_content, _first_half_stats) = block_dir.get(&first_half).unwrap(); + assert_eq!(first_half_content, b"01234567"); + + let hash = addrs[0].hash.clone(); + let second_half = Address { + start: 8, + len: 8, + hash, + }; + let (second_half_content, _second_half_stats) = block_dir.get(&second_half).unwrap(); + assert_eq!(second_half_content, b"89abcdef"); + } + + #[test] + fn invalid_addresses() { + let (_testdir, block_dir) = setup(); + let mut store_files = StoreFiles::new(block_dir.clone()); + let (addrs, _stats) = store_files + .store_file_content(&"/hello".into(), &mut Cursor::new(b"0123456789abcdef")) + .unwrap(); + assert_eq!(addrs.len(), 1); + + // Address with start point too high. + let hash = addrs[0].hash.clone(); + let starts_too_late = Address { + hash: hash.clone(), + start: 16, + len: 2, + }; + let result = block_dir.get(&starts_too_late); + assert_eq!( + &result.err().unwrap().to_string(), + &format!( + "Address {{ hash: {:?}, start: 16, len: 2 }} \ + extends beyond decompressed block length 16", + hash + ) + ); + + // Address with length too long. + let too_long = Address { + hash: hash.clone(), + start: 10, + len: 10, + }; + let result = block_dir.get(&too_long); + assert_eq!( + &result.err().unwrap().to_string(), + &format!( + "Address {{ hash: {:?}, start: 10, len: 10 }} \ + extends beyond decompressed block length 16", + hash + ) + ); + } + + #[test] + pub fn write_same_data_again() { + let (_testdir, block_dir) = setup(); + + let mut example_file = make_example_file(); + let mut store = StoreFiles::new(block_dir); + let (addrs1, stats) = store + .store_file_content(&Apath::from("/ello"), &mut example_file) + .unwrap(); + assert_eq!(stats.deduplicated_blocks, 0); + assert_eq!(stats.written_blocks, 1); + assert_eq!(stats.uncompressed_bytes, 6); + assert_eq!(stats.compressed_bytes, 8); + + let mut example_file = make_example_file(); + let (addrs2, stats2) = store + .store_file_content(&Apath::from("/ello2"), &mut example_file) + .unwrap(); + assert_eq!(stats2.deduplicated_blocks, 1); + assert_eq!(stats2.written_blocks, 0); + assert_eq!(stats2.compressed_bytes, 0); + + assert_eq!(addrs1, addrs2); + } + + #[test] + // Large enough that it should break across blocks. 
+ pub fn large_file() { + use super::MAX_BLOCK_SIZE; + let (_testdir, block_dir) = setup(); + let mut tf = NamedTempFile::new().unwrap(); + const N_CHUNKS: u64 = 10; + const CHUNK_SIZE: u64 = 1 << 21; + const TOTAL_SIZE: u64 = N_CHUNKS * CHUNK_SIZE; + let a_chunk = vec![b'@'; CHUNK_SIZE as usize]; + for _i in 0..N_CHUNKS { + tf.write_all(&a_chunk).unwrap(); + } + tf.flush().unwrap(); + let tf_len = tf.seek(SeekFrom::Current(0)).unwrap(); + println!("tf len={}", tf_len); + assert_eq!(tf_len, TOTAL_SIZE); + tf.seek(SeekFrom::Start(0)).unwrap(); + + let mut store = StoreFiles::new(block_dir.clone()); + let (addrs, stats) = store + .store_file_content(&Apath::from("/big"), &mut tf) + .unwrap(); + + // Only one block needs to get compressed. The others are deduplicated. + assert_eq!(stats.uncompressed_bytes, MAX_BLOCK_SIZE as u64); + // Should be very compressible + assert!(stats.compressed_bytes < (MAX_BLOCK_SIZE as u64 / 10)); + assert_eq!(stats.written_blocks, 1); + assert_eq!( + stats.deduplicated_blocks as u64, + TOTAL_SIZE / (MAX_BLOCK_SIZE as u64) - 1 + ); + + // 10x 2MB should be twenty blocks + assert_eq!(addrs.len(), 20); + for a in addrs { + let (retr, block_sizes) = block_dir.get(&a).unwrap(); + assert_eq!(retr.len(), MAX_BLOCK_SIZE as usize); + assert!(retr.iter().all(|b| *b == 64u8)); + assert_eq!(block_sizes.uncompressed, MAX_BLOCK_SIZE as u64); + } + } +} diff --git a/src/blockdir.rs b/src/blockdir.rs index cf42da90..1e03b49d 100644 --- a/src/blockdir.rs +++ b/src/blockdir.rs @@ -24,7 +24,6 @@ use std::collections::HashMap; use std::convert::TryInto; use std::io; -use std::io::prelude::*; use std::path::Path; use std::sync::Mutex; @@ -37,7 +36,7 @@ use thousands::Separable; use crate::blockhash::BlockHash; use crate::compress::snappy::{Compressor, Decompressor}; use crate::kind::Kind; -use crate::stats::{CopyStats, Sizes, ValidateStats}; +use crate::stats::{Sizes, ValidateStats}; use crate::transport::local::LocalTransport; use crate::transport::{DirEntry, ListDirNames, Transport}; use crate::*; @@ -104,7 +103,7 @@ impl BlockDir { } /// Returns the number of compressed bytes. - fn compress_and_store(&mut self, in_buf: &[u8], hash: &BlockHash) -> Result { + pub(crate) fn compress_and_store(&mut self, in_buf: &[u8], hash: &BlockHash) -> Result { // TODO: Move this to a BlockWriter, which can hold a reusable buffer. let mut compressor = Compressor::new(); let compressed = compressor.compress(&in_buf)?; @@ -317,307 +316,10 @@ impl BlockDir { }; Ok((decompressor.take_buffer(), sizes)) } -} - -/// Manages storage into the BlockDir of any number of files. -/// -/// At present this just holds a reusable input buffer. -/// -/// In future it will combine small files into aggregate blocks, -/// and perhaps compress them in parallel. -pub(crate) struct StoreFiles { - // TODO: Rename to FileWriter or similar? Perhaps doesn't need to be - // separate from BackupWriter. - block_dir: BlockDir, - input_buf: Vec, -} - -impl StoreFiles { - pub(crate) fn new(block_dir: BlockDir) -> StoreFiles { - StoreFiles { - block_dir, - input_buf: vec![0; MAX_BLOCK_SIZE], - } - } - - pub(crate) fn store_file_content( - &mut self, - apath: &Apath, - from_file: &mut dyn Read, - ) -> Result<(Vec
<Address>, CopyStats)> { - let mut addresses = Vec::<Address>
::with_capacity(1); - let mut stats = CopyStats::default(); - loop { - // TODO: Possibly read repeatedly in case we get a short read and have room for more, - // so that short reads don't lead to short blocks being stored. - // TODO: Error should actually be an error about the source file? - // TODO: This shouldn't directly read from the source, it should take blocks in. - let read_len = - from_file - .read(&mut self.input_buf) - .map_err(|source| Error::StoreFile { - apath: apath.to_owned(), - source, - })?; - if read_len == 0 { - break; - } - let block_data = &self.input_buf[..read_len]; - let hash = hash_bytes(block_data)?; - if self.block_dir.contains(&hash)? { - // TODO: Separate counter for size of the already-present blocks? - stats.deduplicated_blocks += 1; - stats.deduplicated_bytes += read_len as u64; - } else { - let comp_len = self.block_dir.compress_and_store(block_data, &hash)?; - stats.written_blocks += 1; - stats.uncompressed_bytes += read_len as u64; - stats.compressed_bytes += comp_len; - } - addresses.push(Address { - hash, - start: 0, - len: read_len as u64, - }); - } - match addresses.len() { - 0 => stats.empty_files += 1, - 1 => stats.single_block_files += 1, - _ => stats.multi_block_files += 1, - } - Ok((addresses, stats)) - } -} - -fn hash_bytes(in_buf: &[u8]) -> Result { - let mut hasher = Blake2b::new(BLAKE_HASH_SIZE_BYTES); - hasher.update(in_buf); - Ok(BlockHash::from(hasher.finalize())) -} - -#[cfg(test)] -mod tests { - use std::fs; - use std::io::prelude::*; - use std::io::SeekFrom; - - use tempfile::{NamedTempFile, TempDir}; - - use super::*; - - const EXAMPLE_TEXT: &[u8] = b"hello!"; - const EXAMPLE_BLOCK_HASH: &str = "66ad1939a9289aa9f1f1d9ad7bcee694293c7623affb5979bd\ - 3f844ab4adcf2145b117b7811b3cee31e130efd760e9685f208c2b2fb1d67e28262168013ba63c"; - - fn make_example_file() -> NamedTempFile { - let mut tf = NamedTempFile::new().unwrap(); - tf.write_all(EXAMPLE_TEXT).unwrap(); - tf.flush().unwrap(); - tf.seek(SeekFrom::Start(0)).unwrap(); - tf - } - - fn setup() -> (TempDir, BlockDir) { - let testdir = TempDir::new().unwrap(); - let block_dir = BlockDir::create_path(testdir.path()).unwrap(); - (testdir, block_dir) - } - - #[test] - pub fn store_a_file() { - let expected_hash = EXAMPLE_BLOCK_HASH.to_string().parse().unwrap(); - let (testdir, block_dir) = setup(); - let mut example_file = make_example_file(); - - assert_eq!(block_dir.contains(&expected_hash).unwrap(), false); - let mut store = StoreFiles::new(block_dir.clone()); - - let (addrs, stats) = store - .store_file_content(&Apath::from("/hello"), &mut example_file) - .unwrap(); - - // Should be in one block, with the expected hash. - assert_eq!(1, addrs.len()); - assert_eq!(0, addrs[0].start); - assert_eq!(addrs[0].hash, expected_hash); - // Block should be the one block present in the list. - let present_blocks = block_dir.block_names().unwrap().collect::>(); - assert_eq!(present_blocks.len(), 1); - assert_eq!(present_blocks[0], expected_hash); - - // Subdirectory and file should exist - let expected_file = testdir.path().join("66a").join(EXAMPLE_BLOCK_HASH); - let attr = fs::metadata(expected_file).unwrap(); - assert!(attr.is_file()); - - // Compressed size is as expected. 
- assert_eq!(block_dir.compressed_size(&expected_hash).unwrap(), 8); - - assert_eq!(block_dir.contains(&expected_hash).unwrap(), true); - - assert_eq!(stats.deduplicated_blocks, 0); - assert_eq!(stats.written_blocks, 1); - assert_eq!(stats.uncompressed_bytes, 6); - assert_eq!(stats.compressed_bytes, 8); - - // Will vary depending on compressor and we don't want to be too brittle. - assert!(stats.compressed_bytes <= 19, stats.compressed_bytes); - - // Try to read back - let (back, sizes) = block_dir.get(&addrs[0]).unwrap(); - assert_eq!(back, EXAMPLE_TEXT); - assert_eq!( - sizes, - Sizes { - uncompressed: EXAMPLE_TEXT.len() as u64, - compressed: 8u64, - } - ); - - let mut stats = ValidateStats::default(); - block_dir.validate(&mut stats).unwrap(); - assert_eq!(stats.io_errors, 0); - assert_eq!(stats.block_error_count, 0); - assert_eq!(stats.block_read_count, 1); - } - - #[test] - fn retrieve_partial_data() { - let (_testdir, block_dir) = setup(); - let mut store_files = StoreFiles::new(block_dir.clone()); - let (addrs, _stats) = store_files - .store_file_content(&"/hello".into(), &mut io::Cursor::new(b"0123456789abcdef")) - .unwrap(); - assert_eq!(addrs.len(), 1); - let hash = addrs[0].hash.clone(); - let first_half = Address { - start: 0, - len: 8, - hash, - }; - let (first_half_content, _first_half_stats) = block_dir.get(&first_half).unwrap(); - assert_eq!(first_half_content, b"01234567"); - - let hash = addrs[0].hash.clone(); - let second_half = Address { - start: 8, - len: 8, - hash, - }; - let (second_half_content, _second_half_stats) = block_dir.get(&second_half).unwrap(); - assert_eq!(second_half_content, b"89abcdef"); - } - - #[test] - fn invalid_addresses() { - let (_testdir, block_dir) = setup(); - let mut store_files = StoreFiles::new(block_dir.clone()); - let (addrs, _stats) = store_files - .store_file_content(&"/hello".into(), &mut io::Cursor::new(b"0123456789abcdef")) - .unwrap(); - assert_eq!(addrs.len(), 1); - - // Address with start point too high. - let hash = addrs[0].hash.clone(); - let starts_too_late = Address { - hash: hash.clone(), - start: 16, - len: 2, - }; - let result = block_dir.get(&starts_too_late); - assert_eq!( - &result.err().unwrap().to_string(), - &format!( - "Address {{ hash: {:?}, start: 16, len: 2 }} \ - extends beyond decompressed block length 16", - hash - ) - ); - - // Address with length too long. - let too_long = Address { - hash: hash.clone(), - start: 10, - len: 10, - }; - let result = block_dir.get(&too_long); - assert_eq!( - &result.err().unwrap().to_string(), - &format!( - "Address {{ hash: {:?}, start: 10, len: 10 }} \ - extends beyond decompressed block length 16", - hash - ) - ); - } - - #[test] - pub fn write_same_data_again() { - let (_testdir, block_dir) = setup(); - - let mut example_file = make_example_file(); - let mut store = StoreFiles::new(block_dir); - let (addrs1, stats) = store - .store_file_content(&Apath::from("/ello"), &mut example_file) - .unwrap(); - assert_eq!(stats.deduplicated_blocks, 0); - assert_eq!(stats.written_blocks, 1); - assert_eq!(stats.uncompressed_bytes, 6); - assert_eq!(stats.compressed_bytes, 8); - - let mut example_file = make_example_file(); - let (addrs2, stats2) = store - .store_file_content(&Apath::from("/ello2"), &mut example_file) - .unwrap(); - assert_eq!(stats2.deduplicated_blocks, 1); - assert_eq!(stats2.written_blocks, 0); - assert_eq!(stats2.compressed_bytes, 0); - - assert_eq!(addrs1, addrs2); - } - - #[test] - // Large enough that it should break across blocks. 
- pub fn large_file() { - use super::MAX_BLOCK_SIZE; - let (_testdir, block_dir) = setup(); - let mut tf = NamedTempFile::new().unwrap(); - const N_CHUNKS: u64 = 10; - const CHUNK_SIZE: u64 = 1 << 21; - const TOTAL_SIZE: u64 = N_CHUNKS * CHUNK_SIZE; - let a_chunk = vec![b'@'; CHUNK_SIZE as usize]; - for _i in 0..N_CHUNKS { - tf.write_all(&a_chunk).unwrap(); - } - tf.flush().unwrap(); - let tf_len = tf.seek(SeekFrom::Current(0)).unwrap(); - println!("tf len={}", tf_len); - assert_eq!(tf_len, TOTAL_SIZE); - tf.seek(SeekFrom::Start(0)).unwrap(); - - let mut store = StoreFiles::new(block_dir.clone()); - let (addrs, stats) = store - .store_file_content(&Apath::from("/big"), &mut tf) - .unwrap(); - - // Only one block needs to get compressed. The others are deduplicated. - assert_eq!(stats.uncompressed_bytes, MAX_BLOCK_SIZE as u64); - // Should be very compressible - assert!(stats.compressed_bytes < (MAX_BLOCK_SIZE as u64 / 10)); - assert_eq!(stats.written_blocks, 1); - assert_eq!( - stats.deduplicated_blocks as u64, - TOTAL_SIZE / (MAX_BLOCK_SIZE as u64) - 1 - ); - - // 10x 2MB should be twenty blocks - assert_eq!(addrs.len(), 20); - for a in addrs { - let (retr, block_sizes) = block_dir.get(&a).unwrap(); - assert_eq!(retr.len(), MAX_BLOCK_SIZE as usize); - assert!(retr.iter().all(|b| *b == 64u8)); - assert_eq!(block_sizes.uncompressed, MAX_BLOCK_SIZE as u64); - } + pub(crate) fn hash_bytes(&self, in_buf: &[u8]) -> Result { + let mut hasher = Blake2b::new(BLAKE_HASH_SIZE_BYTES); + hasher.update(in_buf); + Ok(BlockHash::from(hasher.finalize())) } } From 08036fd946084004aed064ac89edaa90cc1eb07e Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 18 Oct 2020 08:13:28 -0700 Subject: [PATCH 13/32] Remove StoreFiles All we need here is a reusable buffer. --- src/backup.rs | 200 ++++++++++++++++++++++++++------------------------ 1 file changed, 105 insertions(+), 95 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 5e3d632c..34092cfa 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -87,8 +87,9 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> struct BackupWriter { band: Band, index_builder: IndexBuilder, - store_files: StoreFiles, stats: CopyStats, + block_dir: BlockDir, + buffer: Buffer, /// The index for the last stored band, used as hints for whether newly /// stored files have changed. @@ -115,7 +116,8 @@ impl BackupWriter { Ok(BackupWriter { band, index_builder, - store_files: StoreFiles::new(archive.block_dir().clone()), + buffer: Buffer::new(), + block_dir: archive.block_dir().clone(), stats: CopyStats::default(), basis_index, }) @@ -181,9 +183,12 @@ impl BackupWriter { self.stats.new_files += 1; } let read_source = from_tree.file_contents(&source_entry); - let (addrs, file_stats) = self - .store_files - .store_file_content(&apath, &mut read_source?)?; + let (addrs, file_stats) = store_file_content( + &apath, + &mut read_source?, + &mut self.block_dir, + &mut self.buffer, + )?; self.stats += file_stats; self.index_builder.push_entry(IndexEntry { addrs, @@ -200,74 +205,59 @@ impl BackupWriter { } } -/// Manages storage into the BlockDir of any number of files. -/// -/// At present this just holds a reusable input buffer. -/// -/// In future it will combine small files into aggregate blocks, -/// and perhaps compress them in parallel. -pub(crate) struct StoreFiles { - // TODO: Rename to FileWriter or similar? Perhaps doesn't need to be - // separate from BackupWriter. - block_dir: BlockDir, - input_buf: Vec, -} +/// A reusable block-sized buffer. 
+struct Buffer(Vec); -impl StoreFiles { - pub(crate) fn new(block_dir: BlockDir) -> StoreFiles { - StoreFiles { - block_dir, - input_buf: vec![0; MAX_BLOCK_SIZE], - } +impl Buffer { + fn new() -> Buffer { + Buffer(vec![0; MAX_BLOCK_SIZE]) } +} - pub(crate) fn store_file_content( - &mut self, - apath: &Apath, - from_file: &mut dyn Read, - ) -> Result<(Vec
<Address>, CopyStats)> { - let mut addresses = Vec::<Address>
::with_capacity(1); - let mut stats = CopyStats::default(); - loop { - // TODO: Possibly read repeatedly in case we get a short read and have room for more, - // so that short reads don't lead to short blocks being stored. - // TODO: Error should actually be an error about the source file? - // TODO: This shouldn't directly read from the source, it should take blocks in. - let read_len = - from_file - .read(&mut self.input_buf) - .map_err(|source| Error::StoreFile { - apath: apath.to_owned(), - source, - })?; - if read_len == 0 { - break; - } - let block_data = &self.input_buf[..read_len]; - let hash = self.block_dir.hash_bytes(block_data)?; - if self.block_dir.contains(&hash)? { - // TODO: Separate counter for size of the already-present blocks? - stats.deduplicated_blocks += 1; - stats.deduplicated_bytes += read_len as u64; - } else { - let comp_len = self.block_dir.compress_and_store(block_data, &hash)?; - stats.written_blocks += 1; - stats.uncompressed_bytes += read_len as u64; - stats.compressed_bytes += comp_len; - } - addresses.push(Address { - hash, - start: 0, - len: read_len as u64, - }); +fn store_file_content( + apath: &Apath, + from_file: &mut dyn Read, + block_dir: &mut BlockDir, + buffer: &mut Buffer, +) -> Result<(Vec
<Address>, CopyStats)> { + let mut addresses = Vec::<Address>
::with_capacity(1); + let mut stats = CopyStats::default(); + loop { + // TODO: Possibly read repeatedly in case we get a short read and have room for more, + // so that short reads don't lead to short blocks being stored. + let read_len = from_file + .read(&mut buffer.0) + .map_err(|source| Error::StoreFile { + apath: apath.to_owned(), + source, + })?; + if read_len == 0 { + break; } - match addresses.len() { - 0 => stats.empty_files += 1, - 1 => stats.single_block_files += 1, - _ => stats.multi_block_files += 1, + let block_data = &buffer.0[..read_len]; + let hash = block_dir.hash_bytes(block_data)?; + if block_dir.contains(&hash)? { + // TODO: Separate counter for size of the already-present blocks? + stats.deduplicated_blocks += 1; + stats.deduplicated_bytes += read_len as u64; + } else { + let comp_len = block_dir.compress_and_store(block_data, &hash)?; + stats.written_blocks += 1; + stats.uncompressed_bytes += read_len as u64; + stats.compressed_bytes += comp_len; } - Ok((addresses, stats)) + addresses.push(Address { + hash, + start: 0, + len: read_len as u64, + }); + } + match addresses.len() { + 0 => stats.empty_files += 1, + 1 => stats.single_block_files += 1, + _ => stats.multi_block_files += 1, } + Ok((addresses, stats)) } #[cfg(test)] @@ -303,15 +293,18 @@ mod tests { #[test] pub fn store_a_file() { let expected_hash = EXAMPLE_BLOCK_HASH.to_string().parse().unwrap(); - let (testdir, block_dir) = setup(); + let (testdir, mut block_dir) = setup(); let mut example_file = make_example_file(); assert_eq!(block_dir.contains(&expected_hash).unwrap(), false); - let mut store = StoreFiles::new(block_dir.clone()); - let (addrs, stats) = store - .store_file_content(&Apath::from("/hello"), &mut example_file) - .unwrap(); + let (addrs, stats) = store_file_content( + &Apath::from("/hello"), + &mut example_file, + &mut block_dir, + &mut Buffer::new(), + ) + .unwrap(); // Should be in one block, with the expected hash. assert_eq!(1, addrs.len()); @@ -361,11 +354,14 @@ mod tests { #[test] fn retrieve_partial_data() { - let (_testdir, block_dir) = setup(); - let mut store_files = StoreFiles::new(block_dir.clone()); - let (addrs, _stats) = store_files - .store_file_content(&"/hello".into(), &mut Cursor::new(b"0123456789abcdef")) - .unwrap(); + let (_testdir, mut block_dir) = setup(); + let (addrs, _stats) = store_file_content( + &"/hello".into(), + &mut Cursor::new(b"0123456789abcdef"), + &mut block_dir, + &mut Buffer::new(), + ) + .unwrap(); assert_eq!(addrs.len(), 1); let hash = addrs[0].hash.clone(); let first_half = Address { @@ -388,11 +384,14 @@ mod tests { #[test] fn invalid_addresses() { - let (_testdir, block_dir) = setup(); - let mut store_files = StoreFiles::new(block_dir.clone()); - let (addrs, _stats) = store_files - .store_file_content(&"/hello".into(), &mut Cursor::new(b"0123456789abcdef")) - .unwrap(); + let (_testdir, mut block_dir) = setup(); + let (addrs, _stats) = store_file_content( + &"/hello".into(), + &mut Cursor::new(b"0123456789abcdef"), + &mut block_dir, + &mut Buffer::new(), + ) + .unwrap(); assert_eq!(addrs.len(), 1); // Address with start point too high. 
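For reference, the addresses returned by store_file_content are sufficient to reassemble a file later: each Address names a stored block by hash plus a start/len byte range within it, and BlockDir::get returns the decompressed bytes for that range, as the retrieve_partial_data test above exercises. A minimal read-back sketch, assuming a hypothetical read_back helper inside this crate and simplified error handling:

    fn read_back(block_dir: &BlockDir, addrs: &[Address]) -> Result<Vec<u8>> {
        let mut out = Vec::new();
        for addr in addrs {
            // Fetch the decompressed bytes for this block range and append them in order.
            let (bytes, _sizes) = block_dir.get(addr)?;
            out.extend_from_slice(&bytes);
        }
        Ok(out)
    }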
@@ -431,22 +430,30 @@ mod tests { #[test] pub fn write_same_data_again() { - let (_testdir, block_dir) = setup(); + let (_testdir, mut block_dir) = setup(); let mut example_file = make_example_file(); - let mut store = StoreFiles::new(block_dir); - let (addrs1, stats) = store - .store_file_content(&Apath::from("/ello"), &mut example_file) - .unwrap(); + let mut buffer = Buffer::new(); + let (addrs1, stats) = store_file_content( + &Apath::from("/ello"), + &mut example_file, + &mut block_dir, + &mut buffer, + ) + .unwrap(); assert_eq!(stats.deduplicated_blocks, 0); assert_eq!(stats.written_blocks, 1); assert_eq!(stats.uncompressed_bytes, 6); assert_eq!(stats.compressed_bytes, 8); let mut example_file = make_example_file(); - let (addrs2, stats2) = store - .store_file_content(&Apath::from("/ello2"), &mut example_file) - .unwrap(); + let (addrs2, stats2) = store_file_content( + &Apath::from("/ello2"), + &mut example_file, + &mut block_dir, + &mut buffer, + ) + .unwrap(); assert_eq!(stats2.deduplicated_blocks, 1); assert_eq!(stats2.written_blocks, 0); assert_eq!(stats2.compressed_bytes, 0); @@ -458,7 +465,7 @@ mod tests { // Large enough that it should break across blocks. pub fn large_file() { use super::MAX_BLOCK_SIZE; - let (_testdir, block_dir) = setup(); + let (_testdir, mut block_dir) = setup(); let mut tf = NamedTempFile::new().unwrap(); const N_CHUNKS: u64 = 10; const CHUNK_SIZE: u64 = 1 << 21; @@ -473,10 +480,13 @@ mod tests { assert_eq!(tf_len, TOTAL_SIZE); tf.seek(SeekFrom::Start(0)).unwrap(); - let mut store = StoreFiles::new(block_dir.clone()); - let (addrs, stats) = store - .store_file_content(&Apath::from("/big"), &mut tf) - .unwrap(); + let (addrs, stats) = store_file_content( + &Apath::from("/big"), + &mut tf, + &mut block_dir, + &mut Buffer::new(), + ) + .unwrap(); // Only one block needs to get compressed. The others are deduplicated. assert_eq!(stats.uncompressed_bytes, MAX_BLOCK_SIZE as u64); From 31cf0158d4808c4509fcb8973541acafa80c2416 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 18 Oct 2020 08:34:48 -0700 Subject: [PATCH 14/32] Add a test for splitting backups across hunks --- tests/backup.rs | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 tests/backup.rs diff --git a/tests/backup.rs b/tests/backup.rs new file mode 100644 index 00000000..3ae58101 --- /dev/null +++ b/tests/backup.rs @@ -0,0 +1,41 @@ +// Copyright 2015, 2016, 2017, 2019, 2020 Martin Pool. + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +//! Tests focussed on backup behavior. + +use conserve::test_fixtures::ScratchArchive; +use conserve::test_fixtures::TreeFixture; +use conserve::*; + +#[test] +pub fn many_files_multiple_hunks() { + let af = ScratchArchive::new(); + let srcdir = TreeFixture::new(); + // The directory also counts as an entry, so we should be able to fit 1999 + // files in 2 hunks of 1000 entries. 
+ for i in 0..1999 { + srcdir.create_file(&format!("file{}", i)); + } + let stats = backup(&af, &srcdir.live_tree(), &BackupOptions::default()).expect("backup"); + assert_eq!( + stats.index_builder_stats.index_hunks, 2, + "expect exactly 2 hunks" + ); + assert_eq!(stats.files, 1999); + assert_eq!(stats.directories, 1); + assert_eq!(stats.unknown_kind, 0); + + assert_eq!(stats.new_files, 1999); + assert_eq!(stats.single_block_files, 1999); + assert_eq!(stats.errors, 0); + assert_eq!(stats.written_blocks, 1); +} From cd0ca30fccc74ff829043b934892bd2cb56da851 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 18 Oct 2020 08:41:21 -0700 Subject: [PATCH 15/32] Check names in many-file tree --- tests/backup.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/backup.rs b/tests/backup.rs index 3ae58101..f45120f7 100644 --- a/tests/backup.rs +++ b/tests/backup.rs @@ -23,7 +23,7 @@ pub fn many_files_multiple_hunks() { // The directory also counts as an entry, so we should be able to fit 1999 // files in 2 hunks of 1000 entries. for i in 0..1999 { - srcdir.create_file(&format!("file{}", i)); + srcdir.create_file(&format!("file{:04}", i)); } let stats = backup(&af, &srcdir.live_tree(), &BackupOptions::default()).expect("backup"); assert_eq!( @@ -37,5 +37,13 @@ pub fn many_files_multiple_hunks() { assert_eq!(stats.new_files, 1999); assert_eq!(stats.single_block_files, 1999); assert_eq!(stats.errors, 0); + // They all have the same content. assert_eq!(stats.written_blocks, 1); + + let tree = af.open_stored_tree(BandSelectionPolicy::Latest).unwrap(); + let mut entry_iter = tree.iter_entries().unwrap(); + assert_eq!(entry_iter.next().unwrap().apath(), "/"); + for (i, entry) in entry_iter.enumerate() { + assert_eq!(entry.apath().to_string(), format!("/file{:04}", i)); + } } From e53a69d4d3bde3aa0a9822b9086ef649e9403f05 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 18 Oct 2020 08:50:18 -0700 Subject: [PATCH 16/32] Move responsibility for breaking index hunks to backup The backup code needs to know about index hunk boundaries so that it can combine and parallelize storage of data within that hunk. And so the higher-level backup code now tells the index when it's done. --- src/backup.rs | 45 +++++++++++++++++++++++++++------------------ src/index.rs | 27 ++++++--------------------- src/stitch.rs | 28 ++++++++++++++-------------- 3 files changed, 47 insertions(+), 53 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 34092cfa..974b0eb1 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -35,6 +35,18 @@ pub struct BackupOptions { pub excludes: Option, } +// This causes us to walk the source tree twice, which is probably an acceptable option +// since it's nice to see realistic overall progress. We could keep all the entries +// in memory, and maybe we should, but it might get unreasonably big. +// if options.measure_first { +// progress_bar.set_phase("Measure source tree".to_owned()); +// // TODO: Maybe read all entries for the source tree in to memory now, rather than walking it +// // again a second time? But, that'll potentially use memory proportional to tree size, which +// // I'd like to avoid, and also perhaps make it more likely we grumble about files that were +// // deleted or changed while this is running. +// progress_bar.set_bytes_total(source.size()?.file_bytes as u64); +// } + /// Backup a source directory into a new band in the archive. /// /// Returns statistics about what was copied. 
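Taken together with the group-at-a-time loop introduced earlier in this series, the effect of this patch is that the backup loop, rather than the index writer, decides where index hunks end. A simplified sketch of that shape follows; it is not the literal diff, MAX_ENTRIES_PER_HUNK is assumed to be 1000 (matching the test above), and the closure names are purely illustrative:

    use itertools::Itertools;

    const MAX_ENTRIES_PER_HUNK: usize = 1000;

    fn process_in_hunks<T>(
        entries: impl Iterator<Item = T>,
        mut copy_one: impl FnMut(T),
        mut flush_group: impl FnMut(),
    ) {
        // Split a single pass over the source entries into hunk-sized groups.
        let groups = entries.chunks(MAX_ENTRIES_PER_HUNK);
        for group in &groups {
            for entry in group {
                copy_one(entry);
            }
            // After each group: finish any pending data blocks, then write one index hunk.
            flush_group();
        }
    }

In backup() the two closures correspond to writer.copy_entry() and writer.flush_group(), and flush_group() in turn calls IndexBuilder::finish_hunk().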
@@ -42,18 +54,6 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> let mut writer = BackupWriter::begin(archive)?; let mut stats = CopyStats::default(); let mut progress_bar = ProgressBar::new(); - // This causes us to walk the source tree twice, which is probably an acceptable option - // since it's nice to see realistic overall progress. We could keep all the entries - // in memory, and maybe we should, but it might get unreasonably big. - - // if options.measure_first { - // progress_bar.set_phase("Measure source tree".to_owned()); - // // TODO: Maybe read all entries for the source tree in to memory now, rather than walking it - // // again a second time? But, that'll potentially use memory proportional to tree size, which - // // I'd like to avoid, and also perhaps make it more likely we grumble about files that were - // // deleted or changed while this is running. - // progress_bar.set_bytes_total(source.size()?.file_bytes as u64); - // } progress_bar.set_phase("Copying".to_owned()); let entry_iter = source.iter_filtered(None, options.excludes.clone())?; @@ -75,8 +75,7 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> progress_bar.increment_bytes_done(bytes); } } - // TODO: Finish any compression groups. - // TODO: Sort and then write out the index hunk for this group. + writer.flush_group()?; } stats += writer.finish()?; // TODO: Merge in stats from the tree iter and maybe the source tree? @@ -132,6 +131,13 @@ impl BackupWriter { }) } + /// Write out any pending data blocks, and then the pending index entries. + fn flush_group(&mut self) -> Result<()> { + // TODO: Finish any compression groups. + // TODO: Sort and then write out the index hunk for this group. + self.index_builder.finish_hunk() + } + fn copy_entry(&mut self, entry: &R::Entry, source: &R) -> Result<()> { match entry.kind() { Kind::Dir => self.copy_dir(entry), @@ -151,7 +157,8 @@ impl BackupWriter { // TODO: Pass back index sizes self.stats.directories += 1; self.index_builder - .push_entry(IndexEntry::metadata_from(source_entry)) + .push_entry(IndexEntry::metadata_from(source_entry)); + Ok(()) } /// Copy in the contents of a file from another tree. @@ -174,7 +181,7 @@ impl BackupWriter { // with the archive invariants, which include that all the // blocks referenced by the index, are actually present. self.stats.unmodified_files += 1; - self.index_builder.push_entry(basis_entry)?; + self.index_builder.push_entry(basis_entry); return Ok(()); } else { self.stats.modified_files += 1; @@ -193,7 +200,8 @@ impl BackupWriter { self.index_builder.push_entry(IndexEntry { addrs, ..IndexEntry::metadata_from(source_entry) - }) + }); + Ok(()) } fn copy_symlink(&mut self, source_entry: &E) -> Result<()> { @@ -201,7 +209,8 @@ impl BackupWriter { self.stats.symlinks += 1; assert!(target.is_some()); self.index_builder - .push_entry(IndexEntry::metadata_from(source_entry)) + .push_entry(IndexEntry::metadata_from(source_entry)); + Ok(()) } } diff --git a/src/index.rs b/src/index.rs index 7872e1dc..ff4610a4 100644 --- a/src/index.rs +++ b/src/index.rs @@ -159,6 +159,7 @@ impl IndexBuilder { } } + /// Finish the last hunk of this index, and return the stats. pub fn finish(mut self) -> Result { self.finish_hunk()?; Ok(self.stats) @@ -167,20 +168,9 @@ impl IndexBuilder { /// Append an entry to the index. /// /// The new entry must sort after everything already written to the index. 
- pub(crate) fn push_entry(&mut self, entry: IndexEntry) -> Result<()> { - // We do this check here rather than the Index constructor so that we - // can still read invalid apaths... + pub(crate) fn push_entry(&mut self, entry: IndexEntry) { self.check_order.check(&entry.apath); self.entries.push(entry); - if self.entries.len() >= MAX_ENTRIES_PER_HUNK { - self.finish_hunk() - } else { - Ok(()) - } - } - - pub fn flush(&mut self) -> Result<()> { - self.finish_hunk() } /// Finish this hunk of the index. @@ -188,11 +178,10 @@ impl IndexBuilder { /// This writes all the currently queued entries into a new index file /// in the band directory, and then clears the buffer to start receiving /// entries for the next hunk. - fn finish_hunk(&mut self) -> Result<()> { + pub fn finish_hunk(&mut self) -> Result<()> { if self.entries.is_empty() { return Ok(()); } - let relpath = hunk_relpath(self.sequence); let write_error = |source| Error::WriteIndex { path: relpath.clone(), @@ -483,7 +472,6 @@ mod tests { addrs: vec![], target: None, }) - .unwrap(); } #[test] @@ -518,8 +506,7 @@ mod tests { kind: Kind::File, addrs: vec![], target: None, - }) - .unwrap(); + }); ib.push_entry(IndexEntry { apath: "aaa".into(), mtime: 1_461_736_377, @@ -527,8 +514,7 @@ mod tests { kind: Kind::File, addrs: vec![], target: None, - }) - .unwrap(); + }); } #[test] @@ -542,8 +528,7 @@ mod tests { addrs: vec![], mtime_nanos: 0, target: None, - }) - .unwrap(); + }); } #[test] diff --git a/src/stitch.rs b/src/stitch.rs index a5b0684a..337be17f 100644 --- a/src/stitch.rs +++ b/src/stitch.rs @@ -148,10 +148,10 @@ mod test { let band = Band::create(&af)?; assert_eq!(*band.id(), BandId::zero()); let mut ib = band.index_builder(); - ib.push_entry(symlink("/0", "b0"))?; - ib.push_entry(symlink("/1", "b0"))?; - ib.flush()?; - ib.push_entry(symlink("/2", "b0"))?; + ib.push_entry(symlink("/0", "b0")); + ib.push_entry(symlink("/1", "b0")); + ib.finish_hunk()?; + ib.push_entry(symlink("/2", "b0")); // Flush this hunk but leave the band incomplete. 
let stats = ib.finish()?; assert_eq!(stats.index_hunks, 2); @@ -159,11 +159,11 @@ mod test { let band = Band::create(&af)?; assert_eq!(band.id().to_string(), "b0001"); let mut ib = band.index_builder(); - ib.push_entry(symlink("/0", "b1"))?; - ib.push_entry(symlink("/1", "b1"))?; - ib.flush()?; - ib.push_entry(symlink("/2", "b1"))?; - ib.push_entry(symlink("/3", "b1"))?; + ib.push_entry(symlink("/0", "b1")); + ib.push_entry(symlink("/1", "b1")); + ib.finish_hunk()?; + ib.push_entry(symlink("/2", "b1")); + ib.push_entry(symlink("/3", "b1")); let stats = ib.finish()?; assert_eq!(stats.index_hunks, 2); band.close(2)?; @@ -172,9 +172,9 @@ mod test { let band = Band::create(&af)?; assert_eq!(band.id().to_string(), "b0002"); let mut ib = band.index_builder(); - ib.push_entry(symlink("/0", "b2"))?; - ib.flush()?; - ib.push_entry(symlink("/2", "b2"))?; + ib.push_entry(symlink("/0", "b2")); + ib.finish_hunk()?; + ib.push_entry(symlink("/2", "b2")); // incomplete let stats = ib.finish()?; assert_eq!(stats.index_hunks, 2); @@ -191,8 +191,8 @@ mod test { let band = Band::create(&af)?; assert_eq!(band.id().to_string(), "b0005"); let mut ib = band.index_builder(); - ib.push_entry(symlink("/0", "b5"))?; - ib.push_entry(symlink("/00", "b5"))?; + ib.push_entry(symlink("/0", "b5")); + ib.push_entry(symlink("/00", "b5")); let stats = ib.finish()?; assert_eq!(stats.index_hunks, 1); // incomplete From aea784aa3ac4721cf88f1174bd88b97b88214b84 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 18 Oct 2020 09:35:44 -0700 Subject: [PATCH 17/32] Tidy up --- src/backup.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 974b0eb1..8bf60367 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -71,9 +71,7 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> stats.errors += 1; continue; } - if let Some(bytes) = entry.size() { - progress_bar.increment_bytes_done(bytes); - } + progress_bar.increment_bytes_done(entry.size().unwrap_or(0)); } writer.flush_group()?; } @@ -134,7 +132,6 @@ impl BackupWriter { /// Write out any pending data blocks, and then the pending index entries. fn flush_group(&mut self) -> Result<()> { // TODO: Finish any compression groups. - // TODO: Sort and then write out the index hunk for this group. self.index_builder.finish_hunk() } From 6a8367eaef2493c95c0d1d3fbb8f14ed185c23e1 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 18 Oct 2020 10:50:56 -0700 Subject: [PATCH 18/32] Add FileCombiner --- src/backup.rs | 113 +++++++++++++++++++++++++++++++++++++++++++++----- src/lib.rs | 6 +++ 2 files changed, 108 insertions(+), 11 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 8bf60367..d2cbed29 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -14,6 +14,7 @@ //! Make a backup by walking a source directory and copying the contents //! into an archive. +use std::convert::TryInto; use std::io::prelude::*; use globset::GlobSet; @@ -241,17 +242,7 @@ fn store_file_content( break; } let block_data = &buffer.0[..read_len]; - let hash = block_dir.hash_bytes(block_data)?; - if block_dir.contains(&hash)? { - // TODO: Separate counter for size of the already-present blocks? 
- stats.deduplicated_blocks += 1; - stats.deduplicated_bytes += read_len as u64; - } else { - let comp_len = block_dir.compress_and_store(block_data, &hash)?; - stats.written_blocks += 1; - stats.uncompressed_bytes += read_len as u64; - stats.compressed_bytes += comp_len; - } + let hash = store_or_deduplicate(block_data, block_dir, &mut stats)?; addresses.push(Address { hash, start: 0, @@ -266,6 +257,106 @@ fn store_file_content( Ok((addresses, stats)) } +fn store_or_deduplicate( + block_data: &[u8], + block_dir: &mut BlockDir, + stats: &mut CopyStats, +) -> Result { + let hash = block_dir.hash_bytes(block_data)?; + if block_dir.contains(&hash)? { + stats.deduplicated_blocks += 1; + stats.deduplicated_bytes += block_data.len() as u64; + } else { + let comp_len = block_dir.compress_and_store(block_data, &hash)?; + stats.written_blocks += 1; + stats.uncompressed_bytes += block_data.len() as u64; + stats.compressed_bytes += comp_len; + } + Ok(hash) +} + +/// Combines multiple small files into a single block. +/// +/// When the block is finished, and only then, this returns the index entries with the addresses +/// completed. +struct FileCombiner { + /// Buffer of concatenated data from small files. + buf: Vec, + queue: Vec, +} + +/// A file in the process of being written into a combined block. +/// +/// While this exists, the data has been stored into the combine buffer, and we know +/// the offset and length. But since the combine buffer hasn't been finished and hashed, +/// we do not yet know a full address. +struct QueuedFile { + /// Offset of the start of the data from this file within `buf`. + start: usize, + /// Length of data in this file. + len: usize, + /// IndexEntry without addresses. + entry: IndexEntry, +} + +impl FileCombiner { + fn new() -> FileCombiner { + FileCombiner { + buf: Vec::new(), + queue: Vec::new(), + } + } + + /// Write all the content from the combined block to a blockdir. + /// + /// Returns the fully populated entries for all files in this combined block. + /// + /// After this call the FileCombiner is empty and can be reused for more files into a new + /// block. + fn flush(&mut self, block_dir: &mut BlockDir) -> Result<(CopyStats, Vec)> { + let mut stats = CopyStats::default(); + let hash: BlockHash = store_or_deduplicate(&self.buf, block_dir, &mut stats)?; + self.buf.clear(); + let finished_entries = self + .queue + .drain(..) + .map(|qf| IndexEntry { + addrs: vec![Address { + hash: hash.clone(), + start: qf.start.try_into().unwrap(), + len: qf.len.try_into().unwrap(), + }], + ..qf.entry + }) + .collect(); + Ok((stats, finished_entries)) + } + + /// Add the contents of a small file into this combiner. + /// + /// `entry` should be an IndexEntry that's complete apart from the block addresses. + fn push_file(&mut self, entry: IndexEntry, from_file: &mut dyn Read) -> Result<()> { + let start = self.buf.len(); + let expected_len: usize = entry + .size() + .expect("small file has no length") + .try_into() + .unwrap(); + assert!(expected_len > 0, "small file is empty"); + self.buf.resize(start + expected_len, 0); + let len = from_file + .read(&mut self.buf[start..]) + .map_err(|source| Error::StoreFile { + apath: entry.apath.to_owned(), + source, + })?; + // TODO: Maybe check this is actually the end of the file. 
+ self.buf.truncate(start + len); + self.queue.push(QueuedFile { start, len, entry }); + Ok(()) + } +} + #[cfg(test)] mod tests { use std::fs; diff --git a/src/lib.rs b/src/lib.rs index 6766999d..7ae48c5a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -91,6 +91,12 @@ pub const SYMLINKS_SUPPORTED: bool = cfg!(target_family = "unix"); /// Break blocks at this many uncompressed bytes. pub(crate) const MAX_BLOCK_SIZE: usize = 1 << 20; +/// Maximum file size that will be combined with others rather than being stored alone. +const SMALL_FILE_CAP: u64 = 100_000; + +/// Target maximum uncompressed size for combined blocks. +const TARGET_COMBINED_BLOCK_SIZE: usize = MAX_BLOCK_SIZE; + /// ISO timestamp, for https://docs.rs/chrono/0.4.11/chrono/format/strftime/. const TIMESTAMP_FORMAT: &str = "%F %T"; From 63840e93a1284bee4ffe039d384e026499ab720b Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 25 Oct 2020 14:31:11 -0700 Subject: [PATCH 19/32] More progress towards combining small files --- src/backup.rs | 168 ++++++++++++++++++++++++++++++++++++----------- src/blockdir.rs | 18 +++++ src/lib.rs | 2 +- src/live_tree.rs | 10 +-- 4 files changed, 152 insertions(+), 46 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index d2cbed29..711b89a5 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -242,7 +242,7 @@ fn store_file_content( break; } let block_data = &buffer.0[..read_len]; - let hash = store_or_deduplicate(block_data, block_dir, &mut stats)?; + let hash = block_dir.store_or_deduplicate(block_data, &mut stats)?; addresses.push(Address { hash, start: 0, @@ -257,24 +257,6 @@ fn store_file_content( Ok((addresses, stats)) } -fn store_or_deduplicate( - block_data: &[u8], - block_dir: &mut BlockDir, - stats: &mut CopyStats, -) -> Result { - let hash = block_dir.hash_bytes(block_data)?; - if block_dir.contains(&hash)? { - stats.deduplicated_blocks += 1; - stats.deduplicated_bytes += block_data.len() as u64; - } else { - let comp_len = block_dir.compress_and_store(block_data, &hash)?; - stats.written_blocks += 1; - stats.uncompressed_bytes += block_data.len() as u64; - stats.compressed_bytes += comp_len; - } - Ok(hash) -} - /// Combines multiple small files into a single block. /// /// When the block is finished, and only then, this returns the index entries with the addresses @@ -283,6 +265,10 @@ struct FileCombiner { /// Buffer of concatenated data from small files. buf: Vec, queue: Vec, + /// Entries for files that have been written to the blockdir, and that have complete addresses. + finished: Vec, + stats: CopyStats, + block_dir: BlockDir, } /// A file in the process of being written into a combined block. @@ -300,60 +286,88 @@ struct QueuedFile { } impl FileCombiner { - fn new() -> FileCombiner { + fn new(block_dir: BlockDir) -> FileCombiner { FileCombiner { + block_dir, buf: Vec::new(), queue: Vec::new(), + finished: Vec::new(), + stats: CopyStats::default(), } } + /// Flush any pending files, and return accumulated file entries and stats. + fn drain(mut self) -> Result<(CopyStats, Vec)> { + self.flush()?; + Ok((self.stats, self.finished)) + } + /// Write all the content from the combined block to a blockdir. /// /// Returns the fully populated entries for all files in this combined block. /// /// After this call the FileCombiner is empty and can be reused for more files into a new /// block. 
- fn flush(&mut self, block_dir: &mut BlockDir) -> Result<(CopyStats, Vec)> { - let mut stats = CopyStats::default(); - let hash: BlockHash = store_or_deduplicate(&self.buf, block_dir, &mut stats)?; + fn flush(&mut self) -> Result<()> { + if self.queue.is_empty() { + assert!(self.buf.is_empty()); + return Ok(()); + } + let hash = self + .block_dir + .store_or_deduplicate(&self.buf, &mut self.stats)?; self.buf.clear(); - let finished_entries = self - .queue - .drain(..) - .map(|qf| IndexEntry { + self.finished + .extend(self.queue.drain(..).map(|qf| IndexEntry { addrs: vec![Address { hash: hash.clone(), start: qf.start.try_into().unwrap(), len: qf.len.try_into().unwrap(), }], ..qf.entry - }) - .collect(); - Ok((stats, finished_entries)) + })); + Ok(()) } /// Add the contents of a small file into this combiner. /// /// `entry` should be an IndexEntry that's complete apart from the block addresses. - fn push_file(&mut self, entry: IndexEntry, from_file: &mut dyn Read) -> Result<()> { + fn push_file(&mut self, live_entry: &LiveEntry, from_file: &mut dyn Read) -> Result<()> { let start = self.buf.len(); - let expected_len: usize = entry + let expected_len: usize = live_entry .size() .expect("small file has no length") .try_into() .unwrap(); - assert!(expected_len > 0, "small file is empty"); + let index_entry = IndexEntry::metadata_from(live_entry); + if expected_len == 0 { + panic!(); + self.finished.push(index_entry); + return Ok(()); + } self.buf.resize(start + expected_len, 0); let len = from_file .read(&mut self.buf[start..]) .map_err(|source| Error::StoreFile { - apath: entry.apath.to_owned(), + apath: live_entry.apath().to_owned(), source, })?; - // TODO: Maybe check this is actually the end of the file. self.buf.truncate(start + len); - self.queue.push(QueuedFile { start, len, entry }); - Ok(()) + if len == 0 { + panic!(); + self.finished.push(index_entry); + } else { + self.queue.push(QueuedFile { + start, + len, + entry: index_entry, + }); + } + if self.buf.len() >= TARGET_COMBINED_BLOCK_SIZE { + self.flush() + } else { + Ok(()) + } } } @@ -366,6 +380,7 @@ mod tests { use tempfile::{NamedTempFile, TempDir}; use crate::stats::Sizes; + use crate::unix_time::UnixTime; use super::*; @@ -388,7 +403,7 @@ mod tests { } #[test] - pub fn store_a_file() { + fn store_a_file() { let expected_hash = EXAMPLE_BLOCK_HASH.to_string().parse().unwrap(); let (testdir, mut block_dir) = setup(); let mut example_file = make_example_file(); @@ -526,7 +541,7 @@ mod tests { } #[test] - pub fn write_same_data_again() { + fn write_same_data_again() { let (_testdir, mut block_dir) = setup(); let mut example_file = make_example_file(); @@ -560,7 +575,7 @@ mod tests { #[test] // Large enough that it should break across blocks. 
- pub fn large_file() { + fn large_file() { use super::MAX_BLOCK_SIZE; let (_testdir, mut block_dir) = setup(); let mut tf = NamedTempFile::new().unwrap(); @@ -604,4 +619,77 @@ mod tests { assert_eq!(block_sizes.uncompressed, MAX_BLOCK_SIZE as u64); } } + + #[test] + fn combine_one_file() { + let testdir = TempDir::new().unwrap(); + let block_dir = BlockDir::create_path(testdir.path()).unwrap(); + let mut combiner = FileCombiner::new(block_dir); + let file_bytes = b"some stuff"; + let entry = LiveEntry { + apath: Apath::from("/0"), + kind: Kind::File, + mtime: UnixTime { + secs: 1603116230, + nanosecs: 0, + }, + symlink_target: None, + size: Some(file_bytes.len() as u64), + }; + let mut content = Cursor::new(file_bytes); + combiner.push_file(&entry, &mut content).unwrap(); + let (stats, entries) = combiner.drain().unwrap(); + assert_eq!(entries.len(), 1); + let addrs = entries[0].addrs.clone(); + assert_eq!(addrs.len(), 1, "combined file should have one block"); + assert_eq!(addrs[0].start, 0); + assert_eq!(addrs[0].len, 10); + let expected_entry = IndexEntry { + addrs, + ..IndexEntry::metadata_from(&entry) + }; + assert_eq!(entries[0], expected_entry); + assert_eq!(stats.uncompressed_bytes, 10); + assert_eq!(stats.written_blocks, 1); + } + + #[test] + fn combine_several_small_files() { + let testdir = TempDir::new().unwrap(); + let block_dir = BlockDir::create_path(testdir.path()).unwrap(); + let mut combiner = FileCombiner::new(block_dir); + let file_bytes = b"some stuff"; + let mut live_entry = LiveEntry { + apath: Apath::from("/0"), + kind: Kind::File, + mtime: UnixTime { + secs: 1603116230, + nanosecs: 0, + }, + symlink_target: None, + size: Some(file_bytes.len() as u64), + }; + for i in 0..10 { + live_entry.apath = Apath::from(format!("/{:02}", i)); + let mut content = Cursor::new(file_bytes); + combiner.push_file(&live_entry, &mut content).unwrap(); + } + let (stats, entries) = combiner.drain().unwrap(); + assert_eq!(entries.len(), 10); + + let first_hash = &entries[0].addrs[0].hash; + + for (i, entry) in entries.iter().enumerate() { + assert_eq!( + entry.addrs, + &[Address { + hash: first_hash.clone(), + start: i as u64 * 10, + len: file_bytes.len() as u64 + }] + ); + } + assert_eq!(stats.uncompressed_bytes, 100); + assert_eq!(stats.written_blocks, 1); + } } diff --git a/src/blockdir.rs b/src/blockdir.rs index 1e03b49d..442fa2b6 100644 --- a/src/blockdir.rs +++ b/src/blockdir.rs @@ -131,6 +131,24 @@ impl BlockDir { Ok(comp_len) } + pub(crate) fn store_or_deduplicate( + &mut self, + block_data: &[u8], + stats: &mut CopyStats, + ) -> Result { + let hash = self.hash_bytes(block_data)?; + if self.contains(&hash)? { + stats.deduplicated_blocks += 1; + stats.deduplicated_bytes += block_data.len() as u64; + } else { + let comp_len = self.compress_and_store(block_data, &hash)?; + stats.written_blocks += 1; + stats.uncompressed_bytes += block_data.len() as u64; + stats.compressed_bytes += comp_len; + } + Ok(hash) + } + /// True if the named block is present in this directory. 
pub fn contains(&self, hash: &BlockHash) -> Result { self.transport diff --git a/src/lib.rs b/src/lib.rs index 7ae48c5a..0ae921f4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -66,7 +66,7 @@ pub use crate::merge::{MergeTrees, MergedEntryKind}; pub use crate::misc::bytes_to_human_mb; pub use crate::progress::ProgressBar; pub use crate::restore::{RestoreOptions, RestoreTree}; -pub use crate::stats::{DeleteStats, ValidateStats}; +pub use crate::stats::{CopyStats, DeleteStats, ValidateStats}; pub use crate::stored_tree::StoredTree; pub use crate::tree::{ReadBlocks, ReadTree, TreeSize, WriteTree}; diff --git a/src/live_tree.rs b/src/live_tree.rs index 822f82ee..68185ae3 100644 --- a/src/live_tree.rs +++ b/src/live_tree.rs @@ -48,11 +48,11 @@ impl LiveTree { /// An in-memory Entry describing a file/dir/symlink in a live tree. #[derive(Debug, Clone, Eq, PartialEq)] pub struct LiveEntry { - apath: Apath, - kind: Kind, - mtime: UnixTime, - size: Option, - symlink_target: Option, + pub(crate) apath: Apath, + pub(crate) kind: Kind, + pub(crate) mtime: UnixTime, + pub(crate) size: Option, + pub(crate) symlink_target: Option, } fn relative_path(root: &Path, apath: &Apath) -> PathBuf { From 21a493cc43e8be2f67233601658f0f7efc99cf71 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sun, 25 Oct 2020 14:46:19 -0700 Subject: [PATCH 20/32] Further refactor towards combining files --- src/backup.rs | 42 ++++++++++++++++++++++++++++-------------- src/stats.rs | 1 + 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 711b89a5..cc0b3b26 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -188,13 +188,13 @@ impl BackupWriter { self.stats.new_files += 1; } let read_source = from_tree.file_contents(&source_entry); - let (addrs, file_stats) = store_file_content( + let addrs = store_file_content( &apath, &mut read_source?, &mut self.block_dir, &mut self.buffer, + &mut self.stats, )?; - self.stats += file_stats; self.index_builder.push_entry(IndexEntry { addrs, ..IndexEntry::metadata_from(source_entry) @@ -226,9 +226,9 @@ fn store_file_content( from_file: &mut dyn Read, block_dir: &mut BlockDir, buffer: &mut Buffer, -) -> Result<(Vec
<Address>, CopyStats)> { + stats: &mut CopyStats, +) -> Result<Vec<Address>> { + let mut addresses = Vec::<Address>
::with_capacity(1); - let mut stats = CopyStats::default(); loop { // TODO: Possibly read repeatedly in case we get a short read and have room for more, // so that short reads don't lead to short blocks being stored. @@ -242,7 +242,7 @@ fn store_file_content( break; } let block_data = &buffer.0[..read_len]; - let hash = block_dir.store_or_deduplicate(block_data, &mut stats)?; + let hash = block_dir.store_or_deduplicate(block_data, stats)?; addresses.push(Address { hash, start: 0, @@ -254,7 +254,7 @@ fn store_file_content( 1 => stats.single_block_files += 1, _ => stats.multi_block_files += 1, } - Ok((addresses, stats)) + Ok(addresses) } /// Combines multiple small files into a single block. @@ -341,7 +341,6 @@ impl FileCombiner { .unwrap(); let index_entry = IndexEntry::metadata_from(live_entry); if expected_len == 0 { - panic!(); self.finished.push(index_entry); return Ok(()); } @@ -354,9 +353,10 @@ impl FileCombiner { })?; self.buf.truncate(start + len); if len == 0 { - panic!(); + self.stats.empty_files += 1; self.finished.push(index_entry); } else { + self.stats.small_combined_files += 1; self.queue.push(QueuedFile { start, len, @@ -410,11 +410,13 @@ mod tests { assert_eq!(block_dir.contains(&expected_hash).unwrap(), false); - let (addrs, stats) = store_file_content( + let mut stats = CopyStats::default(); + let addrs = store_file_content( &Apath::from("/hello"), &mut example_file, &mut block_dir, &mut Buffer::new(), + &mut stats, ) .unwrap(); @@ -467,11 +469,12 @@ mod tests { #[test] fn retrieve_partial_data() { let (_testdir, mut block_dir) = setup(); - let (addrs, _stats) = store_file_content( + let addrs = store_file_content( &"/hello".into(), &mut Cursor::new(b"0123456789abcdef"), &mut block_dir, &mut Buffer::new(), + &mut CopyStats::default(), ) .unwrap(); assert_eq!(addrs.len(), 1); @@ -497,11 +500,12 @@ mod tests { #[test] fn invalid_addresses() { let (_testdir, mut block_dir) = setup(); - let (addrs, _stats) = store_file_content( + let addrs = store_file_content( &"/hello".into(), &mut Cursor::new(b"0123456789abcdef"), &mut block_dir, &mut Buffer::new(), + &mut CopyStats::default(), ) .unwrap(); assert_eq!(addrs.len(), 1); @@ -546,11 +550,13 @@ mod tests { let mut example_file = make_example_file(); let mut buffer = Buffer::new(); - let (addrs1, stats) = store_file_content( + let mut stats = CopyStats::default(); + let addrs1 = store_file_content( &Apath::from("/ello"), &mut example_file, &mut block_dir, &mut buffer, + &mut stats, ) .unwrap(); assert_eq!(stats.deduplicated_blocks, 0); @@ -559,11 +565,13 @@ mod tests { assert_eq!(stats.compressed_bytes, 8); let mut example_file = make_example_file(); - let (addrs2, stats2) = store_file_content( + let mut stats2 = CopyStats::default(); + let addrs2 = store_file_content( &Apath::from("/ello2"), &mut example_file, &mut block_dir, &mut buffer, + &mut stats2, ) .unwrap(); assert_eq!(stats2.deduplicated_blocks, 1); @@ -592,11 +600,13 @@ mod tests { assert_eq!(tf_len, TOTAL_SIZE); tf.seek(SeekFrom::Start(0)).unwrap(); - let (addrs, stats) = store_file_content( + let mut stats = CopyStats::default(); + let addrs = store_file_content( &Apath::from("/big"), &mut tf, &mut block_dir, &mut Buffer::new(), + &mut stats, ) .unwrap(); @@ -689,6 +699,10 @@ mod tests { }] ); } + assert_eq!(stats.small_combined_files, 10); + assert_eq!(stats.empty_files, 0); + assert_eq!(stats.single_block_files, 0); + assert_eq!(stats.multi_block_files, 0); assert_eq!(stats.uncompressed_bytes, 100); assert_eq!(stats.written_blocks, 1); } diff --git a/src/stats.rs 
b/src/stats.rs index c10bfda8..f5424317 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -132,6 +132,7 @@ pub struct CopyStats { pub written_blocks: usize, pub empty_files: usize, + pub small_combined_files: usize, pub single_block_files: usize, pub multi_block_files: usize, From ba4a33b6fb8880ec8e1db7d220907645f31c1fc5 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Thu, 29 Oct 2020 07:06:06 -0700 Subject: [PATCH 21/32] Sort entries just before writing index hunks Previously the API required they be added to the IndexBuilder in order, but that gets in the way of grouping together small files and parallelizing compression. Instead, allow them to be added in arbitrary order within the hunk, and the IndexWriter will sort before serializing. Backup code is still responsible for keeping them in order across hunks, and the IndexWriter checks this. Also, rename IndexBuilder to IndexWriter. --- src/backup.rs | 4 +- src/band.rs | 4 +- src/index.rs | 191 ++++++++++++++++++++++++-------------------------- src/lib.rs | 2 +- src/stats.rs | 4 +- 5 files changed, 100 insertions(+), 105 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index cc0b3b26..41ba9cf1 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -84,7 +84,7 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> /// Accepts files to write in the archive (in apath order.) struct BackupWriter { band: Band, - index_builder: IndexBuilder, + index_builder: IndexWriter, stats: CopyStats, block_dir: BlockDir, buffer: Buffer, @@ -132,7 +132,7 @@ impl BackupWriter { /// Write out any pending data blocks, and then the pending index entries. fn flush_group(&mut self) -> Result<()> { - // TODO: Finish any compression groups. + // TODO: Finish FileCombiner, when this class has one. self.index_builder.finish_hunk() } diff --git a/src/band.rs b/src/band.rs index 4c14f137..255f26aa 100644 --- a/src/band.rs +++ b/src/band.rs @@ -181,8 +181,8 @@ impl Band { &self.band_id } - pub fn index_builder(&self) -> IndexBuilder { - IndexBuilder::new(self.transport.sub_transport(INDEX_DIR)) + pub fn index_builder(&self) -> IndexWriter { + IndexWriter::new(self.transport.sub_transport(INDEX_DIR)) } /// Get read-only access to the index of this band. diff --git a/src/index.rs b/src/index.rs index ff4610a4..c667a8a6 100644 --- a/src/index.rs +++ b/src/index.rs @@ -21,7 +21,7 @@ use std::vec; use crate::compress::snappy::{Compressor, Decompressor}; use crate::kind::Kind; -use crate::stats::{IndexBuilderStats, IndexReadStats}; +use crate::stats::{IndexReadStats, IndexWriterStats}; use crate::transport::local::LocalTransport; use crate::transport::Transport; use crate::unix_time::UnixTime; @@ -123,56 +123,65 @@ impl IndexEntry { } } -/// Accumulates ordered changes to the index and streams them out to index files. -pub struct IndexBuilder { +/// Write out index hunks. +/// +/// This class is responsible for: remembering the hunk number, and checking that the +/// hunks preserve apath order. +pub struct IndexWriter { /// The `i` directory within the band where all files for this index are written. transport: Box, - /// Currently queued entries to be written out. + /// Currently queued entries to be written out, in arbitrary order. entries: Vec, /// Index hunk number, starting at 0. sequence: u32, - /// The last-added filename, to enforce ordering. At the start of the first hunk - /// this is empty; at the start of a later hunk it's the last path from the previous - /// hunk, and otherwise it's the last path from `entries`. 
+ /// The last filename from the previous hunk, to enforce ordering. At the + /// start of the first hunk this is empty; at the start of a later hunk it's + /// the last path from the previous hunk. check_order: apath::CheckOrder, /// Statistics about work done while writing this index. - pub stats: IndexBuilderStats, + pub stats: IndexWriterStats, compressor: Compressor, } /// Accumulate and write out index entries into files in an index directory. -impl IndexBuilder { +impl IndexWriter { /// Make a new builder that will write files into the given directory. - pub fn new(transport: Box) -> IndexBuilder { - IndexBuilder { + pub fn new(transport: Box) -> IndexWriter { + IndexWriter { transport, entries: Vec::::with_capacity(MAX_ENTRIES_PER_HUNK), sequence: 0, check_order: apath::CheckOrder::new(), - stats: IndexBuilderStats::default(), + stats: IndexWriterStats::default(), compressor: Compressor::new(), } } /// Finish the last hunk of this index, and return the stats. - pub fn finish(mut self) -> Result { + pub fn finish(mut self) -> Result { self.finish_hunk()?; Ok(self.stats) } - /// Append an entry to the index. + /// Write new index entries. + /// + /// Entries within one hunk may be added in arbitrary order, but they must all + /// sort after previously-written content. /// /// The new entry must sort after everything already written to the index. pub(crate) fn push_entry(&mut self, entry: IndexEntry) { - self.check_order.check(&entry.apath); self.entries.push(entry); } + pub(crate) fn append_entries(&mut self, entries: &[IndexEntry]) { + self.entries.extend_from_slice(entries); + } + /// Finish this hunk of the index. /// /// This writes all the currently queued entries into a new index file @@ -182,6 +191,14 @@ impl IndexBuilder { if self.entries.is_empty() { return Ok(()); } + self.entries.sort_by(|a, b| { + debug_assert!(a.apath != b.apath); + a.apath.cmp(&b.apath) + }); + self.check_order.check(&self.entries[0].apath); + if self.entries.len() > 1 { + self.check_order.check(&self.entries.last().unwrap().apath); + } let relpath = hunk_relpath(self.sequence); let write_error = |source| Error::WriteIndex { path: relpath.clone(), @@ -189,21 +206,19 @@ impl IndexBuilder { }; let json = serde_json::to_vec(&self.entries).map_err(|source| Error::SerializeIndex { source })?; - let uncompressed_len = json.len() as u64; if (self.sequence % HUNKS_PER_SUBDIR) == 0 { self.transport .create_dir(&subdir_relpath(self.sequence)) .map_err(write_error)?; } let compressed_bytes = self.compressor.compress(&json)?; - let compressed_len = compressed_bytes.len(); self.transport .write_file(&relpath, compressed_bytes) .map_err(write_error)?; self.stats.index_hunks += 1; - self.stats.compressed_index_bytes += compressed_len as u64; - self.stats.uncompressed_index_bytes += uncompressed_len as u64; + self.stats.compressed_index_bytes += compressed_bytes.len() as u64; + self.stats.uncompressed_index_bytes += json.len() as u64; self.entries.clear(); // Ready for the next hunk. 
self.sequence += 1; Ok(()) @@ -457,21 +472,21 @@ mod tests { use super::transport::local::LocalTransport; use super::*; - fn scratch_indexbuilder() -> (TempDir, IndexBuilder) { + fn setup() -> (TempDir, IndexWriter) { let testdir = TempDir::new().unwrap(); - let ib = IndexBuilder::new(Box::new(LocalTransport::new(testdir.path()))); + let ib = IndexWriter::new(Box::new(LocalTransport::new(testdir.path()))); (testdir, ib) } - fn add_an_entry(ib: &mut IndexBuilder, apath: &str) { - ib.push_entry(IndexEntry { + fn sample_entry(apath: &str) -> IndexEntry { + IndexEntry { apath: apath.into(), mtime: 1_461_736_377, mtime_nanos: 0, kind: Kind::File, addrs: vec![], target: None, - }) + } } #[test] @@ -494,41 +509,39 @@ mod tests { ); } + #[test] + fn index_builder_sorts_entries() { + let (_testdir, mut ib) = setup(); + ib.push_entry(sample_entry("/zzz")); + ib.push_entry(sample_entry("/aaa")); + ib.finish_hunk().unwrap(); + } + #[test] #[should_panic] - fn index_builder_checks_order() { - let (_testdir, mut ib) = scratch_indexbuilder(); - ib.push_entry(IndexEntry { - apath: "/zzz".into(), - mtime: 1_461_736_377, - mtime_nanos: 0, + fn index_builder_checks_names() { + let (_testdir, mut ib) = setup(); + ib.push_entry(sample_entry("../escapecat")); + ib.finish_hunk().unwrap(); + } - kind: Kind::File, - addrs: vec![], - target: None, - }); - ib.push_entry(IndexEntry { - apath: "aaa".into(), - mtime: 1_461_736_377, - mtime_nanos: 0, - kind: Kind::File, - addrs: vec![], - target: None, - }); + #[test] + #[should_panic] + fn no_duplicate_paths() { + let (_testdir, mut ib) = setup(); + ib.push_entry(sample_entry("/again")); + ib.push_entry(sample_entry("/again")); + ib.finish_hunk().unwrap(); } #[test] #[should_panic] - fn index_builder_checks_names() { - let (_testdir, mut ib) = scratch_indexbuilder(); - ib.push_entry(IndexEntry { - apath: "../escapecat".into(), - mtime: 1_461_736_377, - kind: Kind::File, - addrs: vec![], - mtime_nanos: 0, - target: None, - }); + fn no_duplicate_paths_across_hunks() { + let (_testdir, mut ib) = setup(); + ib.push_entry(sample_entry("/again")); + ib.finish_hunk().unwrap(); + ib.push_entry(sample_entry("/again")); + ib.finish_hunk().unwrap(); } #[test] @@ -538,11 +551,19 @@ mod tests { #[test] fn basic() { - let (testdir, mut ib) = scratch_indexbuilder(); - add_an_entry(&mut ib, "/apple"); - add_an_entry(&mut ib, "/banana"); - ib.finish_hunk().unwrap(); - drop(ib); + let (testdir, mut ib) = setup(); + ib.append_entries(&[sample_entry("/apple"), sample_entry("/banana")]); + let stats = ib.finish().unwrap(); + + assert_eq!(stats.index_hunks, 1); + assert!(stats.compressed_index_bytes > 30); + assert!( + stats.compressed_index_bytes < 70, + "expected shorter compressed index: {}", + stats.compressed_index_bytes + ); + assert!(stats.uncompressed_index_bytes > 100); + assert!(stats.uncompressed_index_bytes < 200); assert!( std::fs::metadata(testdir.path().join("00000").join("000000000")) @@ -563,13 +584,10 @@ mod tests { #[test] fn multiple_hunks() { - let (testdir, mut ib) = scratch_indexbuilder(); - add_an_entry(&mut ib, "/1.1"); - add_an_entry(&mut ib, "/1.2"); + let (testdir, mut ib) = setup(); + ib.append_entries(&[sample_entry("/1.1"), sample_entry("/1.2")]); ib.finish_hunk().unwrap(); - - add_an_entry(&mut ib, "/2.1"); - add_an_entry(&mut ib, "/2.2"); + ib.append_entries(&[sample_entry("/2.1"), sample_entry("/2.2")]); ib.finish_hunk().unwrap(); let index_read = IndexRead::open_path(&testdir.path()); @@ -598,13 +616,10 @@ mod tests { #[test] fn 
iter_hunks_advance_to_after() { - let (testdir, mut ib) = scratch_indexbuilder(); - add_an_entry(&mut ib, "/1.1"); - add_an_entry(&mut ib, "/1.2"); + let (testdir, mut ib) = setup(); + ib.append_entries(&[sample_entry("/1.1"), sample_entry("/1.2")]); ib.finish_hunk().unwrap(); - - add_an_entry(&mut ib, "/2.1"); - add_an_entry(&mut ib, "/2.2"); + ib.append_entries(&[sample_entry("/2.1"), sample_entry("/2.2")]); ib.finish_hunk().unwrap(); let index_read = IndexRead::open_path(&testdir.path()); @@ -681,38 +696,18 @@ mod tests { assert_eq!(names, [] as [&str; 0]); } - #[test] - #[should_panic] - fn no_duplicate_paths() { - let (_testdir, mut ib) = scratch_indexbuilder(); - add_an_entry(&mut ib, "/hello"); - add_an_entry(&mut ib, "/hello"); - } - - #[test] - #[should_panic] - fn no_duplicate_paths_across_hunks() { - let (_testdir, mut ib) = scratch_indexbuilder(); - add_an_entry(&mut ib, "/hello"); - ib.finish_hunk().unwrap(); - - // Try to add an identically-named file within the next hunk and it should error, - // because the IndexBuilder remembers the last file name written. - add_an_entry(&mut ib, "hello"); - } - #[test] fn advance() { - let (testdir, mut ib) = scratch_indexbuilder(); - add_an_entry(&mut ib, "/bar"); - add_an_entry(&mut ib, "/foo"); - add_an_entry(&mut ib, "/foobar"); + let (testdir, mut ib) = setup(); + ib.push_entry(sample_entry("/bar")); + ib.push_entry(sample_entry("/foo")); + ib.push_entry(sample_entry("/foobar")); ib.finish_hunk().unwrap(); // Make multiple hunks to test traversal across hunks. - add_an_entry(&mut ib, "/g01"); - add_an_entry(&mut ib, "/g02"); - add_an_entry(&mut ib, "/g03"); + ib.push_entry(sample_entry("/g01")); + ib.push_entry(sample_entry("/g02")); + ib.push_entry(sample_entry("/g03")); ib.finish_hunk().unwrap(); // Advance to /foo and read on from there. @@ -752,9 +747,9 @@ mod tests { /// https://github.com/sourcefrog/conserve/issues/95 #[test] fn no_final_empty_hunk() -> Result<()> { - let (testdir, mut ib) = scratch_indexbuilder(); + let (testdir, mut ib) = setup(); for i in 0..MAX_ENTRIES_PER_HUNK { - add_an_entry(&mut ib, &format!("/{:0>10}", i)); + ib.push_entry(sample_entry(&format!("/{:0>10}", i))); } ib.finish_hunk()?; // Think about, but don't actually add some files diff --git a/src/lib.rs b/src/lib.rs index 0ae921f4..b9ca85fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -59,7 +59,7 @@ pub use crate::copy_tree::copy_tree; pub use crate::entry::Entry; pub use crate::errors::Error; pub use crate::gc_lock::GarbageCollectionLock; -pub use crate::index::{IndexBuilder, IndexEntry, IndexRead}; +pub use crate::index::{IndexEntry, IndexRead, IndexWriter}; pub use crate::kind::Kind; pub use crate::live_tree::{LiveEntry, LiveTree}; pub use crate::merge::{MergeTrees, MergedEntryKind}; diff --git a/src/stats.rs b/src/stats.rs index f5424317..17012b8d 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -94,7 +94,7 @@ pub struct IndexReadStats { } #[derive(Add, AddAssign, Clone, Debug, Default, Eq, PartialEq)] -pub struct IndexBuilderStats { +pub struct IndexWriterStats { pub index_hunks: u64, pub uncompressed_index_bytes: u64, pub compressed_index_bytes: u64, @@ -138,7 +138,7 @@ pub struct CopyStats { pub errors: usize, - pub index_builder_stats: IndexBuilderStats, + pub index_builder_stats: IndexWriterStats, // TODO: Include elapsed time. 
} From 2656da0b752f82d49c46ef3cb9229b765acdc390 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Thu, 29 Oct 2020 07:14:16 -0700 Subject: [PATCH 22/32] Clippy cleanups --- src/archive.rs | 2 +- src/bandid.rs | 2 +- src/bin/conserve.rs | 2 +- src/excludes.rs | 2 +- src/live_tree.rs | 2 +- tests/cli.rs | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index ff336e93..8e7398e2 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -367,7 +367,7 @@ impl Archive { } stats }) - .reduce(|| ValidateStats::default(), |a, b| a + b); + .reduce(ValidateStats::default, |a, b| a + b); Ok(stats) } diff --git a/src/bandid.rs b/src/bandid.rs index ad297304..8929a02d 100644 --- a/src/bandid.rs +++ b/src/bandid.rs @@ -108,7 +108,7 @@ impl fmt::Display for BandId { /// but they can be longer. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut result = String::with_capacity(self.seqs.len() * 5); - result.push_str("b"); + result.push('b'); for s in &self.seqs { result.push_str(&format!("{:04}-", s)); } diff --git a/src/bin/conserve.rs b/src/bin/conserve.rs index 540d0220..669537b0 100644 --- a/src/bin/conserve.rs +++ b/src/bin/conserve.rs @@ -267,7 +267,7 @@ impl Command { output::show_tree_diff( &mut MergeTrees::new( st.iter_filtered(None, excludes.clone())?, - lt.iter_filtered(None, excludes.clone())?, + lt.iter_filtered(None, excludes)?, ), &mut stdout, )?; diff --git a/src/excludes.rs b/src/excludes.rs index e5a87c4c..29f61224 100644 --- a/src/excludes.rs +++ b/src/excludes.rs @@ -29,7 +29,7 @@ pub fn from_strings, S: AsRef>( if empty { return Ok(None); } - builder.build().map_err(Into::into).map(|gs| Some(gs)) + builder.build().map_err(Into::into).map(Some) } pub fn excludes_nothing() -> GlobSet { diff --git a/src/live_tree.rs b/src/live_tree.rs index 68185ae3..6f3768d3 100644 --- a/src/live_tree.rs +++ b/src/live_tree.rs @@ -171,7 +171,7 @@ impl Iter { /// Construct a new iter that will visit everything below this root path, /// subject to some exclusions fn new(root_path: &Path, subtree: Option, excludes: Option) -> Result { - let subtree = subtree.unwrap_or("/".into()); + let subtree = subtree.unwrap_or_else(|| "/".into()); let start_path = relative_path(root_path, &subtree); let start_metadata = fs::symlink_metadata(&start_path).map_err(Error::from)?; // Preload iter to return the root and then recurse into it. diff --git a/tests/cli.rs b/tests/cli.rs index d42b583f..4ab45228 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -173,7 +173,7 @@ both /subdir/subfile let is_expected_blocks = |output: &[u8]| { let output_str = std::str::from_utf8(&output).unwrap(); let mut blocks: Vec<&str> = output_str.lines().collect(); - blocks.sort(); + blocks.sort_unstable(); blocks == expected_blocks }; From dcd8c8cf0ef8368105c66177c6e314d149046b69 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Fri, 30 Oct 2020 07:36:24 -0700 Subject: [PATCH 23/32] Update TODOs --- src/backup.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 41ba9cf1..93532338 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -77,7 +77,7 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> writer.flush_group()?; } stats += writer.finish()?; - // TODO: Merge in stats from the tree iter and maybe the source tree? + // TODO: Merge in stats from the source tree? 
Ok(stats) } @@ -152,7 +152,6 @@ impl BackupWriter { } fn copy_dir(&mut self, source_entry: &E) -> Result<()> { - // TODO: Pass back index sizes self.stats.directories += 1; self.index_builder .push_entry(IndexEntry::metadata_from(source_entry)); @@ -356,6 +355,9 @@ impl FileCombiner { self.stats.empty_files += 1; self.finished.push(index_entry); } else { + // TODO: Check whether this file is exactly the same as, or a prefix of, + // one already stored inside this block. In that case trim the buffer and + // use the existing start/len. self.stats.small_combined_files += 1; self.queue.push(QueuedFile { start, From d66470979e7fb254e5c68ae21a998b14da48fca8 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Fri, 30 Oct 2020 07:41:06 -0700 Subject: [PATCH 24/32] Simplify --- src/backup.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 93532338..c4e0ad7c 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -354,17 +354,17 @@ impl FileCombiner { if len == 0 { self.stats.empty_files += 1; self.finished.push(index_entry); - } else { - // TODO: Check whether this file is exactly the same as, or a prefix of, - // one already stored inside this block. In that case trim the buffer and - // use the existing start/len. - self.stats.small_combined_files += 1; - self.queue.push(QueuedFile { - start, - len, - entry: index_entry, - }); + return Ok(()); } + // TODO: Check whether this file is exactly the same as, or a prefix of, + // one already stored inside this block. In that case trim the buffer and + // use the existing start/len. + self.stats.small_combined_files += 1; + self.queue.push(QueuedFile { + start, + len, + entry: index_entry, + }); if self.buf.len() >= TARGET_COMBINED_BLOCK_SIZE { self.flush() } else { From 39facb686d1dc61d723d77699b676bb98ecfd744 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Fri, 30 Oct 2020 08:55:35 -0700 Subject: [PATCH 25/32] Actually combine small files --- src/backup.rs | 110 ++++++++++--------------------------------- src/index.rs | 14 +++--- src/live_tree.rs | 10 ++-- src/stats.rs | 26 +++++++++- src/test_fixtures.rs | 14 ++++++ tests/backup.rs | 83 ++++++++++++++++++++++++++++++-- tests/cli.rs | 3 +- tests/integration.rs | 2 +- 8 files changed, 156 insertions(+), 106 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index c4e0ad7c..cb6398f4 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -92,6 +92,8 @@ struct BackupWriter { /// The index for the last stored band, used as hints for whether newly /// stored files have changed. basis_index: Option, + + file_combiner: FileCombiner, } impl BackupWriter { @@ -118,6 +120,7 @@ impl BackupWriter { block_dir: archive.block_dir().clone(), stats: CopyStats::default(), basis_index, + file_combiner: FileCombiner::new(archive.block_dir().clone()), }) } @@ -133,10 +136,13 @@ impl BackupWriter { /// Write out any pending data blocks, and then the pending index entries. fn flush_group(&mut self) -> Result<()> { // TODO: Finish FileCombiner, when this class has one. 
+ let (stats, mut entries) = self.file_combiner.drain()?; + self.stats += stats; + self.index_builder.append_entries(&mut entries); self.index_builder.finish_hunk() } - fn copy_entry(&mut self, entry: &R::Entry, source: &R) -> Result<()> { + fn copy_entry(&mut self, entry: &LiveEntry, source: &LiveTree) -> Result<()> { match entry.kind() { Kind::Dir => self.copy_dir(entry), Kind::File => self.copy_file(entry, source), @@ -159,7 +165,7 @@ impl BackupWriter { } /// Copy in the contents of a file from another tree. - fn copy_file(&mut self, source_entry: &R::Entry, from_tree: &R) -> Result<()> { + fn copy_file(&mut self, source_entry: &LiveEntry, from_tree: &LiveTree) -> Result<()> { self.stats.files += 1; let apath = source_entry.apath(); if let Some(basis_entry) = self @@ -186,10 +192,13 @@ impl BackupWriter { } else { self.stats.new_files += 1; } - let read_source = from_tree.file_contents(&source_entry); + let mut read_source = from_tree.file_contents(&source_entry)?; + if source_entry.size().expect("LiveEntry has a size") <= SMALL_FILE_CAP { + return self.file_combiner.push_file(source_entry, &mut read_source); + } let addrs = store_file_content( &apath, - &mut read_source?, + &mut read_source, &mut self.block_dir, &mut self.buffer, &mut self.stats, @@ -296,9 +305,16 @@ impl FileCombiner { } /// Flush any pending files, and return accumulated file entries and stats. - fn drain(mut self) -> Result<(CopyStats, Vec)> { + /// The FileCombiner is then empty and ready for reuse. + fn drain(&mut self) -> Result<(CopyStats, Vec)> { self.flush()?; - Ok((self.stats, self.finished)) + debug_assert!(self.queue.is_empty()); + debug_assert!(self.buf.is_empty()); + let stats = self.stats.clone(); + self.stats = CopyStats::default(); + let finished = self.finished.drain(..).collect(); + debug_assert!(self.finished.is_empty()); + Ok((stats, finished)) } /// Write all the content from the combined block to a blockdir. @@ -309,12 +325,13 @@ impl FileCombiner { /// block. 
fn flush(&mut self) -> Result<()> { if self.queue.is_empty() { - assert!(self.buf.is_empty()); + debug_assert!(self.buf.is_empty()); return Ok(()); } let hash = self .block_dir .store_or_deduplicate(&self.buf, &mut self.stats)?; + self.stats.combined_blocks += 1; self.buf.clear(); self.finished .extend(self.queue.drain(..).map(|qf| IndexEntry { @@ -340,6 +357,7 @@ impl FileCombiner { .unwrap(); let index_entry = IndexEntry::metadata_from(live_entry); if expected_len == 0 { + self.stats.empty_files += 1; self.finished.push(index_entry); return Ok(()); } @@ -382,7 +400,6 @@ mod tests { use tempfile::{NamedTempFile, TempDir}; use crate::stats::Sizes; - use crate::unix_time::UnixTime; use super::*; @@ -631,81 +648,4 @@ mod tests { assert_eq!(block_sizes.uncompressed, MAX_BLOCK_SIZE as u64); } } - - #[test] - fn combine_one_file() { - let testdir = TempDir::new().unwrap(); - let block_dir = BlockDir::create_path(testdir.path()).unwrap(); - let mut combiner = FileCombiner::new(block_dir); - let file_bytes = b"some stuff"; - let entry = LiveEntry { - apath: Apath::from("/0"), - kind: Kind::File, - mtime: UnixTime { - secs: 1603116230, - nanosecs: 0, - }, - symlink_target: None, - size: Some(file_bytes.len() as u64), - }; - let mut content = Cursor::new(file_bytes); - combiner.push_file(&entry, &mut content).unwrap(); - let (stats, entries) = combiner.drain().unwrap(); - assert_eq!(entries.len(), 1); - let addrs = entries[0].addrs.clone(); - assert_eq!(addrs.len(), 1, "combined file should have one block"); - assert_eq!(addrs[0].start, 0); - assert_eq!(addrs[0].len, 10); - let expected_entry = IndexEntry { - addrs, - ..IndexEntry::metadata_from(&entry) - }; - assert_eq!(entries[0], expected_entry); - assert_eq!(stats.uncompressed_bytes, 10); - assert_eq!(stats.written_blocks, 1); - } - - #[test] - fn combine_several_small_files() { - let testdir = TempDir::new().unwrap(); - let block_dir = BlockDir::create_path(testdir.path()).unwrap(); - let mut combiner = FileCombiner::new(block_dir); - let file_bytes = b"some stuff"; - let mut live_entry = LiveEntry { - apath: Apath::from("/0"), - kind: Kind::File, - mtime: UnixTime { - secs: 1603116230, - nanosecs: 0, - }, - symlink_target: None, - size: Some(file_bytes.len() as u64), - }; - for i in 0..10 { - live_entry.apath = Apath::from(format!("/{:02}", i)); - let mut content = Cursor::new(file_bytes); - combiner.push_file(&live_entry, &mut content).unwrap(); - } - let (stats, entries) = combiner.drain().unwrap(); - assert_eq!(entries.len(), 10); - - let first_hash = &entries[0].addrs[0].hash; - - for (i, entry) in entries.iter().enumerate() { - assert_eq!( - entry.addrs, - &[Address { - hash: first_hash.clone(), - start: i as u64 * 10, - len: file_bytes.len() as u64 - }] - ); - } - assert_eq!(stats.small_combined_files, 10); - assert_eq!(stats.empty_files, 0); - assert_eq!(stats.single_block_files, 0); - assert_eq!(stats.multi_block_files, 0); - assert_eq!(stats.uncompressed_bytes, 100); - assert_eq!(stats.written_blocks, 1); - } } diff --git a/src/index.rs b/src/index.rs index c667a8a6..c14beeef 100644 --- a/src/index.rs +++ b/src/index.rs @@ -178,8 +178,8 @@ impl IndexWriter { self.entries.push(entry); } - pub(crate) fn append_entries(&mut self, entries: &[IndexEntry]) { - self.entries.extend_from_slice(entries); + pub(crate) fn append_entries(&mut self, entries: &mut Vec) { + self.entries.append(entries); } /// Finish this hunk of the index. 
@@ -552,7 +552,7 @@ mod tests { #[test] fn basic() { let (testdir, mut ib) = setup(); - ib.append_entries(&[sample_entry("/apple"), sample_entry("/banana")]); + ib.append_entries(&mut vec![sample_entry("/apple"), sample_entry("/banana")]); let stats = ib.finish().unwrap(); assert_eq!(stats.index_hunks, 1); @@ -585,9 +585,9 @@ mod tests { #[test] fn multiple_hunks() { let (testdir, mut ib) = setup(); - ib.append_entries(&[sample_entry("/1.1"), sample_entry("/1.2")]); + ib.append_entries(&mut vec![sample_entry("/1.1"), sample_entry("/1.2")]); ib.finish_hunk().unwrap(); - ib.append_entries(&[sample_entry("/2.1"), sample_entry("/2.2")]); + ib.append_entries(&mut vec![sample_entry("/2.1"), sample_entry("/2.2")]); ib.finish_hunk().unwrap(); let index_read = IndexRead::open_path(&testdir.path()); @@ -617,9 +617,9 @@ mod tests { #[test] fn iter_hunks_advance_to_after() { let (testdir, mut ib) = setup(); - ib.append_entries(&[sample_entry("/1.1"), sample_entry("/1.2")]); + ib.append_entries(&mut vec![sample_entry("/1.1"), sample_entry("/1.2")]); ib.finish_hunk().unwrap(); - ib.append_entries(&[sample_entry("/2.1"), sample_entry("/2.2")]); + ib.append_entries(&mut vec![sample_entry("/2.1"), sample_entry("/2.2")]); ib.finish_hunk().unwrap(); let index_read = IndexRead::open_path(&testdir.path()); diff --git a/src/live_tree.rs b/src/live_tree.rs index 6f3768d3..49416ef8 100644 --- a/src/live_tree.rs +++ b/src/live_tree.rs @@ -48,11 +48,11 @@ impl LiveTree { /// An in-memory Entry describing a file/dir/symlink in a live tree. #[derive(Debug, Clone, Eq, PartialEq)] pub struct LiveEntry { - pub(crate) apath: Apath, - pub(crate) kind: Kind, - pub(crate) mtime: UnixTime, - pub(crate) size: Option, - pub(crate) symlink_target: Option, + apath: Apath, + kind: Kind, + mtime: UnixTime, + size: Option, + symlink_target: Option, } fn relative_path(root: &Path, apath: &Apath) -> PathBuf { diff --git a/src/stats.rs b/src/stats.rs index 17012b8d..312114e1 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -130,6 +130,8 @@ pub struct CopyStats { pub deduplicated_blocks: usize, pub written_blocks: usize, + /// Blocks containing combined small files. + pub combined_blocks: usize, pub empty_files: usize, pub small_combined_files: usize, @@ -142,6 +144,26 @@ pub struct CopyStats { // TODO: Include elapsed time. } +fn write_size>(w: &mut dyn io::Write, label: &str, value: I) { + writeln!( + w, + "{:>12} MB {}", + value.into().separate_with_commas(), + label + ) + .unwrap(); +} + +fn write_count>(w: &mut dyn io::Write, label: &str, value: I) { + writeln!( + w, + "{:>12} {}", + value.into().separate_with_commas(), + label + ) + .unwrap(); +} + impl CopyStats { pub fn summarize_restore(&self, _to_stream: &mut dyn io::Write) -> Result<()> { // format!( @@ -172,7 +194,7 @@ impl CopyStats { pub fn summarize_backup(&self, w: &mut dyn io::Write) { // TODO: Perhaps summarize to a string, or make this the Display impl. 
- writeln!(w, "{:>12} files:", self.files.separate_with_commas()).unwrap(); + write_count(w, "files:", self.files); writeln!( w, "{:>12} unmodified files", @@ -191,6 +213,7 @@ impl CopyStats { self.new_files.separate_with_commas() ) .unwrap(); + write_count(w, " small combined files", self.small_combined_files); writeln!( w, "{:>12} symlinks", @@ -224,6 +247,7 @@ impl CopyStats { self.written_blocks.separate_with_commas(), ) .unwrap(); + write_count(w, " blocks of combined files", self.combined_blocks); writeln!( w, "{:>12} MB uncompressed", diff --git a/src/test_fixtures.rs b/src/test_fixtures.rs index db960ec0..a0b93a3f 100644 --- a/src/test_fixtures.rs +++ b/src/test_fixtures.rs @@ -122,6 +122,20 @@ impl TreeFixture { full_path } + /// Create a file with a specified length. The first bytes of the file are the `prefix` and the remainder is zeros. + pub fn create_file_of_length_with_prefix( + &self, + relative_path: &str, + length: u64, + prefix: &[u8], + ) -> PathBuf { + let full_path = self.root.join(relative_path); + let mut f = fs::File::create(&full_path).unwrap(); + f.write_all(prefix).unwrap(); + f.set_len(length).expect("set file length"); + full_path + } + pub fn create_dir(&self, relative_path: &str) { fs::create_dir(self.root.join(relative_path)).unwrap(); } diff --git a/tests/backup.rs b/tests/backup.rs index f45120f7..16efbcd9 100644 --- a/tests/backup.rs +++ b/tests/backup.rs @@ -17,13 +17,84 @@ use conserve::test_fixtures::TreeFixture; use conserve::*; #[test] -pub fn many_files_multiple_hunks() { +fn small_files_combined_two_backups() { + let mut af = ScratchArchive::new(); + let srcdir = TreeFixture::new(); + srcdir.create_file("file1"); + srcdir.create_file("file2"); + + let stats1 = backup(&mut af, &srcdir.live_tree(), &BackupOptions::default()).unwrap(); + // Although the two files have the same content, we do not yet dedupe them + // within a combined block, so the block is different to when one identical + // file is stored alone. This could be fixed. + assert_eq!(stats1.combined_blocks, 1); + assert_eq!(stats1.new_files, 2); + assert_eq!(stats1.written_blocks, 1); + assert_eq!(stats1.new_files, 2); + + // Add one more file, also identical, but it is not combined with the previous blocks. + // This is a shortcoming of the current dedupe approach. + srcdir.create_file("file3"); + let stats2 = backup(&mut af, &srcdir.live_tree(), &BackupOptions::default()).unwrap(); + assert_eq!(stats2.new_files, 1); + assert_eq!(stats2.unmodified_files, 2); + assert_eq!(stats2.written_blocks, 1); + assert_eq!(stats2.combined_blocks, 1); + + assert_eq!(af.block_dir().block_names().unwrap().count(), 2); +} + +#[test] +fn many_small_files_combined_to_one_block() { let af = ScratchArchive::new(); let srcdir = TreeFixture::new(); // The directory also counts as an entry, so we should be able to fit 1999 // files in 2 hunks of 1000 entries. 
for i in 0..1999 { - srcdir.create_file(&format!("file{:04}", i)); + srcdir.create_file_of_length_with_prefix( + &format!("file{:04}", i), + 200, + format!("something about {}", i).as_bytes(), + ); + } + let stats = backup(&af, &srcdir.live_tree(), &BackupOptions::default()).expect("backup"); + assert_eq!( + stats.index_builder_stats.index_hunks, 2, + "expect exactly 2 hunks" + ); + assert_eq!(stats.files, 1999); + assert_eq!(stats.directories, 1); + assert_eq!(stats.unknown_kind, 0); + + assert_eq!(stats.new_files, 1999); + assert_eq!(stats.small_combined_files, 1999); + assert_eq!(stats.errors, 0); + // We write two combined blocks + assert_eq!(stats.written_blocks, 2); + assert_eq!(stats.combined_blocks, 2); + + let tree = af.open_stored_tree(BandSelectionPolicy::Latest).unwrap(); + let mut entry_iter = tree.iter_entries().unwrap(); + assert_eq!(entry_iter.next().unwrap().apath(), "/"); + for (i, entry) in entry_iter.enumerate() { + assert_eq!(entry.apath().to_string(), format!("/file{:04}", i)); + } + assert_eq!(tree.iter_entries().unwrap().count(), 2000); +} + +#[test] +pub fn mixed_medium_small_files_two_hunks() { + let af = ScratchArchive::new(); + let srcdir = TreeFixture::new(); + const MEDIUM_LENGTH: u64 = 150_000; + // Make some files large enough not to be grouped together as small files. + for i in 0..1999 { + let name = format!("file{:04}", i); + if i % 100 == 0 { + srcdir.create_file_of_length_with_prefix(&name, MEDIUM_LENGTH, b"something"); + } else { + srcdir.create_file(&name); + } } let stats = backup(&af, &srcdir.live_tree(), &BackupOptions::default()).expect("backup"); assert_eq!( @@ -35,10 +106,11 @@ pub fn many_files_multiple_hunks() { assert_eq!(stats.unknown_kind, 0); assert_eq!(stats.new_files, 1999); - assert_eq!(stats.single_block_files, 1999); + assert_eq!(stats.single_block_files, 20); + assert_eq!(stats.small_combined_files, 1999 - 20); assert_eq!(stats.errors, 0); - // They all have the same content. - assert_eq!(stats.written_blocks, 1); + // There's one deduped block for all the large files, and then one per hunk for all the small combined files. 
+ assert_eq!(stats.written_blocks, 3); let tree = af.open_stored_tree(BandSelectionPolicy::Latest).unwrap(); let mut entry_iter = tree.iter_entries().unwrap(); @@ -46,4 +118,5 @@ pub fn many_files_multiple_hunks() { for (i, entry) in entry_iter.enumerate() { assert_eq!(entry.apath().to_string(), format!("/file{:04}", i)); } + assert_eq!(tree.iter_entries().unwrap().count(), 2000); } diff --git a/tests/cli.rs b/tests/cli.rs index 4ab45228..b2545b69 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -167,8 +167,7 @@ both /subdir/subfile .stdout("b0000\n"); let expected_blocks = [ - "1e99127adff52dec50072705c860e753b2d9c14c0e019bf9a258071699aac38db7d604b3e4ac5345d81ec7e3d8810a805a4e5ff3a44a9f7aa94d120220d2873a", - "fec91c70284c72d0d4e3684788a90de9338a5b2f47f01fedbe203cafd68708718ae5672d10eca804a8121904047d40d1d6cf11e7a76419357a9469af41f22d01", + "ea50e43840e5f310490bba1b641db82480a05e16e9ae220c1e5113c79b59541fa5a6ddb13db20d4df53dfcecb3ed9969e41a329e07afe0fbb597251a789c3575", ]; let is_expected_blocks = |output: &[u8]| { let output_str = std::str::from_utf8(&output).unwrap(); diff --git a/tests/integration.rs b/tests/integration.rs index da2b7fe2..fbb9705e 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -492,6 +492,6 @@ fn delete_bands() { .delete_bands(&[BandId::new(&[0]), BandId::new(&[1])], &Default::default()) .expect("delete_bands"); - assert_eq!(stats.deleted_block_count, 1); + assert_eq!(stats.deleted_block_count, 2); assert_eq!(stats.deleted_band_count, 2); } From 86a534a339eac8152b69265de9104a5ab0476804 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Fri, 30 Oct 2020 09:20:39 -0700 Subject: [PATCH 26/32] Simplify backup stats formatting --- src/backup.rs | 2 +- src/stats.rs | 145 ++++++++++---------------------------------------- 2 files changed, 28 insertions(+), 119 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index cb6398f4..2ac1a8f6 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -126,7 +126,7 @@ impl BackupWriter { fn finish(self) -> Result { let index_builder_stats = self.index_builder.finish()?; - self.band.close(index_builder_stats.index_hunks)?; + self.band.close(index_builder_stats.index_hunks as u64)?; Ok(CopyStats { index_builder_stats, ..self.stats diff --git a/src/stats.rs b/src/stats.rs index 312114e1..01674373 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -95,7 +95,7 @@ pub struct IndexReadStats { #[derive(Add, AddAssign, Clone, Debug, Default, Eq, PartialEq)] pub struct IndexWriterStats { - pub index_hunks: u64, + pub index_hunks: usize, pub uncompressed_index_bytes: u64, pub compressed_index_bytes: u64, } @@ -144,14 +144,17 @@ pub struct CopyStats { // TODO: Include elapsed time. } -fn write_size>(w: &mut dyn io::Write, label: &str, value: I) { - writeln!( +fn write_size>(w: &mut dyn io::Write, label: &str, value: I) { + writeln!(w, "{:>12} MB {}", mb_string(value.into()), label).unwrap(); +} + +fn write_compressed_size(w: &mut dyn io::Write, compressed: u64, uncompressed: u64) { + write_size(w, "uncompressed", uncompressed); + write_size( w, - "{:>12} MB {}", - value.into().separate_with_commas(), - label - ) - .unwrap(); + &format!("after {:.1}x compression", ratio(uncompressed, compressed)), + compressed, + ); } fn write_count>(w: &mut dyn io::Write, label: &str, value: I) { @@ -195,124 +198,30 @@ impl CopyStats { pub fn summarize_backup(&self, w: &mut dyn io::Write) { // TODO: Perhaps summarize to a string, or make this the Display impl. 
write_count(w, "files:", self.files); - writeln!( - w, - "{:>12} unmodified files", - self.unmodified_files.separate_with_commas() - ) - .unwrap(); - writeln!( - w, - "{:>12} modified files", - self.modified_files.separate_with_commas() - ) - .unwrap(); - writeln!( - w, - "{:>12} new files", - self.new_files.separate_with_commas() - ) - .unwrap(); + write_count(w, " unmodified files", self.unmodified_files); + write_count(w, " modified files", self.modified_files); + write_count(w, " new files", self.new_files); write_count(w, " small combined files", self.small_combined_files); - writeln!( - w, - "{:>12} symlinks", - self.symlinks.separate_with_commas() - ) - .unwrap(); - writeln!( - w, - "{:>12} directories", - self.directories.separate_with_commas() - ) - .unwrap(); - writeln!( - w, - "{:>12} special files skipped", - self.unknown_kind.separate_with_commas(), - ) - .unwrap(); + write_count(w, "symlinks", self.symlinks); + write_count(w, "directories", self.directories); + write_count(w, "unsupported file kind", self.unknown_kind); writeln!(w).unwrap(); - writeln!( - w, - "{:>12} deduplicated data blocks:", - self.deduplicated_blocks.separate_with_commas(), - ) - .unwrap(); - writeln!(w, "{:>12} MB saved", mb_string(self.deduplicated_bytes),).unwrap(); - writeln!( - w, - "{:>12} new data blocks:", - self.written_blocks.separate_with_commas(), - ) - .unwrap(); - write_count(w, " blocks of combined files", self.combined_blocks); - writeln!( - w, - "{:>12} MB uncompressed", - mb_string(self.uncompressed_bytes), - ) - .unwrap(); - writeln!( - w, - "{:>12} MB after {:.1}x compression", - mb_string(self.compressed_bytes), - ratio(self.uncompressed_bytes, self.compressed_bytes) - ) - .unwrap(); + write_count(w, "deduplicated data blocks:", self.deduplicated_blocks); + write_size(w, " saved", self.deduplicated_bytes); + writeln!(w).unwrap(); + write_count(w, "new data blocks:", self.deduplicated_blocks); + write_count(w, " blocks of combined files", self.combined_blocks); + write_compressed_size(w, self.compressed_bytes, self.uncompressed_bytes); writeln!(w).unwrap(); + let idx = &self.index_builder_stats; - writeln!( - w, - "{:>12} new index hunks:", - idx.index_hunks.separate_with_commas(), - ) - .unwrap(); - writeln!( - w, - "{:>12} MB uncompressed", - mb_string(idx.uncompressed_index_bytes), - ) - .unwrap(); - writeln!( - w, - "{:>12} MB after {:.1}x compression", - mb_string(idx.compressed_index_bytes), - ratio(idx.uncompressed_index_bytes, idx.compressed_index_bytes), - ) - .unwrap(); + write_count(w, "new index hunks", idx.index_hunks); + write_compressed_size(w, idx.compressed_index_bytes, idx.uncompressed_index_bytes); writeln!(w).unwrap(); - writeln!(w, "{:>12} errors", self.errors.separate_with_commas()).unwrap(); - // format!( - // "{:>12} MB in {} files, {} directories, {} symlinks.\n\ - // {:>12} files are unchanged.\n\ - // {:>12} MB/s input rate.\n\ - // {:>12} MB after deduplication.\n\ - // {:>12} MB in {} blocks after {:.1}x compression.\n\ - // {:>12} MB in {} index hunks after {:.1}x compression.\n\ - // {:>12} elapsed.\n", - // (self.get_size("file.bytes").uncompressed / M).separate_with_commas(), - // self.get_count("file").separate_with_commas(), - // self.get_count("dir").separate_with_commas(), - // self.get_count("symlink").separate_with_commas(), - // self.get_count("file.unchanged").separate_with_commas(), - // (mbps_rate( - // self.get_size("file.bytes").uncompressed, - // self.elapsed_time() - // ) as u64) - // .separate_with_commas(), - // 
(self.get_size("block").uncompressed / M).separate_with_commas(), - // (self.get_size("block").compressed / M).separate_with_commas(), - // self.get_count("block.write").separate_with_commas(), - // compression_ratio(&self.get_size("block")), - // (self.get_size("index").compressed / M).separate_with_commas(), - // self.get_count("index.hunk").separate_with_commas(), - // compression_ratio(&self.get_size("index")), - // duration_to_hms(self.elapsed_time()), - // ) + write_count(w, "errors", self.errors); } } From 3563cf5ea65c85d48d9680c9ed2cf8eb144edaaf Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 31 Oct 2020 17:19:40 -0700 Subject: [PATCH 27/32] Cope with short reads during backups Factor out code to read with retries --- src/backup.rs | 48 +++++++++++------------------------------------- src/io.rs | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 37 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index 2ac1a8f6..d6501b19 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -22,6 +22,7 @@ use itertools::Itertools; use crate::blockdir::Address; use crate::index::IndexEntryIter; +use crate::io::read_with_retries; use crate::stats::CopyStats; use crate::tree::ReadTree; use crate::*; @@ -87,7 +88,6 @@ struct BackupWriter { index_builder: IndexWriter, stats: CopyStats, block_dir: BlockDir, - buffer: Buffer, /// The index for the last stored band, used as hints for whether newly /// stored files have changed. @@ -116,7 +116,6 @@ impl BackupWriter { Ok(BackupWriter { band, index_builder, - buffer: Buffer::new(), block_dir: archive.block_dir().clone(), stats: CopyStats::default(), basis_index, @@ -200,7 +199,6 @@ impl BackupWriter { &apath, &mut read_source, &mut self.block_dir, - &mut self.buffer, &mut self.stats, )?; self.index_builder.push_entry(IndexEntry { @@ -220,41 +218,29 @@ impl BackupWriter { } } -/// A reusable block-sized buffer. -struct Buffer(Vec); - -impl Buffer { - fn new() -> Buffer { - Buffer(vec![0; MAX_BLOCK_SIZE]) - } -} - fn store_file_content( apath: &Apath, from_file: &mut dyn Read, block_dir: &mut BlockDir, - buffer: &mut Buffer, stats: &mut CopyStats, ) -> Result> { + let mut buffer = Vec::new(); let mut addresses = Vec::
::with_capacity(1); loop { - // TODO: Possibly read repeatedly in case we get a short read and have room for more, - // so that short reads don't lead to short blocks being stored. - let read_len = from_file - .read(&mut buffer.0) - .map_err(|source| Error::StoreFile { + read_with_retries(&mut buffer, MAX_BLOCK_SIZE, from_file).map_err(|source| { + Error::StoreFile { apath: apath.to_owned(), source, - })?; - if read_len == 0 { + } + })?; + if buffer.is_empty() { break; } - let block_data = &buffer.0[..read_len]; - let hash = block_dir.store_or_deduplicate(block_data, stats)?; + let hash = block_dir.store_or_deduplicate(buffer.as_slice(), stats)?; addresses.push(Address { hash, start: 0, - len: read_len as u64, + len: buffer.len() as u64, }); } match addresses.len() { @@ -434,7 +420,6 @@ mod tests { &Apath::from("/hello"), &mut example_file, &mut block_dir, - &mut Buffer::new(), &mut stats, ) .unwrap(); @@ -492,7 +477,6 @@ mod tests { &"/hello".into(), &mut Cursor::new(b"0123456789abcdef"), &mut block_dir, - &mut Buffer::new(), &mut CopyStats::default(), ) .unwrap(); @@ -523,7 +507,6 @@ mod tests { &"/hello".into(), &mut Cursor::new(b"0123456789abcdef"), &mut block_dir, - &mut Buffer::new(), &mut CopyStats::default(), ) .unwrap(); @@ -568,13 +551,11 @@ mod tests { let (_testdir, mut block_dir) = setup(); let mut example_file = make_example_file(); - let mut buffer = Buffer::new(); let mut stats = CopyStats::default(); let addrs1 = store_file_content( &Apath::from("/ello"), &mut example_file, &mut block_dir, - &mut buffer, &mut stats, ) .unwrap(); @@ -589,7 +570,6 @@ mod tests { &Apath::from("/ello2"), &mut example_file, &mut block_dir, - &mut buffer, &mut stats2, ) .unwrap(); @@ -620,14 +600,8 @@ mod tests { tf.seek(SeekFrom::Start(0)).unwrap(); let mut stats = CopyStats::default(); - let addrs = store_file_content( - &Apath::from("/big"), - &mut tf, - &mut block_dir, - &mut Buffer::new(), - &mut stats, - ) - .unwrap(); + let addrs = + store_file_content(&Apath::from("/big"), &mut tf, &mut block_dir, &mut stats).unwrap(); // Only one block needs to get compressed. The others are deduplicated. assert_eq!(stats.uncompressed_bytes, MAX_BLOCK_SIZE as u64); diff --git a/src/io.rs b/src/io.rs index 561094d9..48ac7e32 100644 --- a/src/io.rs +++ b/src/io.rs @@ -15,6 +15,7 @@ use std::fs; use std::io; +use std::io::prelude::*; use std::path::Path; pub(crate) fn ensure_dir_exists(path: &Path) -> std::io::Result<()> { @@ -31,3 +32,23 @@ pub(crate) fn ensure_dir_exists(path: &Path) -> std::io::Result<()> { pub(crate) fn directory_is_empty(path: &Path) -> std::io::Result { Ok(std::fs::read_dir(path)?.next().is_none()) } + +/// Read up to `len` bytes into a buffer, and resize the vec to the bytes read. +pub(crate) fn read_with_retries( + buf: &mut Vec, + len: usize, + from_file: &mut dyn Read, +) -> std::io::Result<()> { + // TODO: This could safely resize the buf without initializing, since it will be overwritten. 
+ buf.resize(len, 0); + let mut bytes_read = 0; + while bytes_read < len { + let read_len = from_file.read(&mut buf[bytes_read..])?; + if read_len == 0 { + break; + } + bytes_read += read_len; + } + buf.truncate(bytes_read); + Ok(()) +} From 9627d37dd14604fd57a26d65b1d695410de03dd3 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 31 Oct 2020 17:33:43 -0700 Subject: [PATCH 28/32] Directly store empty files --- src/backup.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/backup.rs b/src/backup.rs index d6501b19..2b252a53 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -176,12 +176,6 @@ impl BackupWriter { if source_entry.is_unchanged_from(&basis_entry) { // TODO: In verbose mode, say if the file is changed, unchanged, // etc, but without duplicating the filenames. - // - // ui::println(&format!("unchanged file {}", apath)); - - // We can reasonably assume that the existing archive complies - // with the archive invariants, which include that all the - // blocks referenced by the index, are actually present. self.stats.unmodified_files += 1; self.index_builder.push_entry(basis_entry); return Ok(()); @@ -192,7 +186,14 @@ impl BackupWriter { self.stats.new_files += 1; } let mut read_source = from_tree.file_contents(&source_entry)?; - if source_entry.size().expect("LiveEntry has a size") <= SMALL_FILE_CAP { + let size = source_entry.size().expect("LiveEntry has a size"); + if size == 0 { + self.index_builder + .push_entry(IndexEntry::metadata_from(source_entry)); + self.stats.empty_files += 1; + return Ok(()); + } + if size <= SMALL_FILE_CAP { return self.file_combiner.push_file(source_entry, &mut read_source); } let addrs = store_file_content( From cd99fed402740f20d898e1d4ea5496c716917569 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 31 Oct 2020 18:28:39 -0700 Subject: [PATCH 29/32] Use stitched basis Fixes #142 Refactor stitching iterator --- src/archive.rs | 2 +- src/backup.rs | 29 +++++++++++++---------- src/band.rs | 2 +- src/bin/conserve.rs | 1 + src/index.rs | 49 ++++++++++++++++++-------------------- src/lib.rs | 1 + src/output.rs | 2 +- src/stitch.rs | 5 ++++ src/test_fixtures.rs | 4 ++++ tests/backup.rs | 56 ++++++++++++++++++++++++++++++++++++++++++++ tests/integration.rs | 5 ++-- 11 files changed, 113 insertions(+), 43 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 8e7398e2..f26683b8 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -237,7 +237,7 @@ impl Archive { .enumerate() .inspect(move |(i, _)| progress_bar.set_fraction(*i, num_bands)) .map(move |(_i, band_id)| Band::open(&archive, &band_id).expect("Failed to open band")) - .flat_map(|band| band.iter_entries().expect("Failed to iter entries")) + .flat_map(|band| band.iter_entries()) .flat_map(|entry| entry.addrs) .map(|addr| addr.hash)) } diff --git a/src/backup.rs b/src/backup.rs index 2b252a53..6bb01271 100644 --- a/src/backup.rs +++ b/src/backup.rs @@ -21,20 +21,31 @@ use globset::GlobSet; use itertools::Itertools; use crate::blockdir::Address; -use crate::index::IndexEntryIter; use crate::io::read_with_retries; use crate::stats::CopyStats; use crate::tree::ReadTree; use crate::*; /// Configuration of how to make a backup. -#[derive(Debug, Default)] +#[derive(Debug)] pub struct BackupOptions { /// Print filenames to the UI as they're copied. pub print_filenames: bool, /// Exclude these globs from the backup. 
pub excludes: Option, + + pub max_entries_per_hunk: usize, +} + +impl Default for BackupOptions { + fn default() -> BackupOptions { + BackupOptions { + print_filenames: false, + excludes: None, + max_entries_per_hunk: crate::index::MAX_ENTRIES_PER_HUNK, + } + } } // This causes us to walk the source tree twice, which is probably an acceptable option @@ -59,10 +70,7 @@ pub fn backup(archive: &Archive, source: &LiveTree, options: &BackupOptions) -> progress_bar.set_phase("Copying".to_owned()); let entry_iter = source.iter_filtered(None, options.excludes.clone())?; - for entry_group in entry_iter - .chunks(crate::index::MAX_ENTRIES_PER_HUNK) - .into_iter() - { + for entry_group in entry_iter.chunks(options.max_entries_per_hunk).into_iter() { for entry in entry_group { if options.print_filenames { crate::ui::println(entry.apath()); @@ -91,7 +99,7 @@ struct BackupWriter { /// The index for the last stored band, used as hints for whether newly /// stored files have changed. - basis_index: Option, + basis_index: Option>, file_combiner: FileCombiner, } @@ -104,12 +112,9 @@ impl BackupWriter { if gc_lock::GarbageCollectionLock::is_locked(archive)? { return Err(Error::GarbageCollectionLockHeld); } - // TODO: Use a stitched band as the basis. - // let basis_index = archive - .last_complete_band()? - .map(|b| b.iter_entries()) - .transpose()?; + .last_band_id()? + .map(|band_id| archive.iter_stitched_index_hunks(&band_id).iter_entries()); // Create the new band only after finding the basis band! let band = Band::create(archive)?; let index_builder = band.index_builder(); diff --git a/src/band.rs b/src/band.rs index 255f26aa..e4706a0a 100644 --- a/src/band.rs +++ b/src/band.rs @@ -191,7 +191,7 @@ impl Band { } /// Return an iterator through entries in this band. - pub fn iter_entries(&self) -> Result { + pub fn iter_entries(&self) -> index::IndexEntryIter { self.index().iter_entries() } diff --git a/src/bin/conserve.rs b/src/bin/conserve.rs index 669537b0..ca68bf6f 100644 --- a/src/bin/conserve.rs +++ b/src/bin/conserve.rs @@ -208,6 +208,7 @@ impl Command { let options = BackupOptions { print_filenames: *verbose, excludes, + ..Default::default() }; let copy_stats = backup(&Archive::open_path(archive)?, &source, &options)?; ui::println("Backup complete."); diff --git a/src/index.rs b/src/index.rs index c14beeef..c1dfe624 100644 --- a/src/index.rs +++ b/src/index.rs @@ -275,11 +275,8 @@ impl IndexRead { } /// Make an iterator that will return all entries in this band. - pub fn iter_entries(&self) -> Result { - Ok(IndexEntryIter { - buffered_entries: Vec::::new().into_iter().peekable(), - hunk_iter: self.iter_hunks(), - }) + pub fn iter_entries(self) -> IndexEntryIter { + IndexEntryIter::new(self.iter_hunks()) } /// Make an iterator that returns hunks of entries from this index. @@ -395,14 +392,23 @@ impl IndexHunkIter { } /// Read out all the entries from a stored index, in apath order. -pub struct IndexEntryIter { +pub struct IndexEntryIter>> { /// Temporarily buffered entries, read from the index files but not yet /// returned to the client. 
buffered_entries: Peekable>, - hunk_iter: IndexHunkIter, + hunk_iter: HI, +} + +impl>> IndexEntryIter { + pub(crate) fn new(hunk_iter: HI) -> Self { + IndexEntryIter { + buffered_entries: Vec::::new().into_iter().peekable(), + hunk_iter, + } + } } -impl Iterator for IndexEntryIter { +impl>> Iterator for IndexEntryIter { type Item = IndexEntry; fn next(&mut self) -> Option { @@ -417,7 +423,7 @@ impl Iterator for IndexEntryIter { } } -impl IndexEntryIter { +impl>> IndexEntryIter { /// Return the entry for given apath, if it is present, otherwise None. /// It follows this will also return None at the end of the index. /// @@ -572,9 +578,7 @@ mod tests { "Index hunk file not found" ); - let mut it = IndexRead::open_path(&testdir.path()) - .iter_entries() - .unwrap(); + let mut it = IndexRead::open_path(&testdir.path()).iter_entries(); let entry = it.next().expect("Get first entry"); assert_eq!(&entry.apath, "/apple"); let entry = it.next().expect("Get second entry"); @@ -591,12 +595,13 @@ mod tests { ib.finish_hunk().unwrap(); let index_read = IndexRead::open_path(&testdir.path()); - let it = index_read.iter_entries().unwrap(); + let it = index_read.iter_entries(); let names: Vec = it.map(|x| x.apath.into()).collect(); assert_eq!(names, &["/1.1", "/1.2", "/2.1", "/2.2"]); // Read it out as hunks. - let hunks: Vec> = index_read.iter_hunks().collect(); + let hunks: Vec> = + IndexRead::open_path(&testdir.path()).iter_hunks().collect(); assert_eq!(hunks.len(), 2); assert_eq!( hunks[0] @@ -711,33 +716,25 @@ mod tests { ib.finish_hunk().unwrap(); // Advance to /foo and read on from there. - let mut it = IndexRead::open_path(&testdir.path()) - .iter_entries() - .unwrap(); + let mut it = IndexRead::open_path(&testdir.path()).iter_entries(); assert_eq!(it.advance_to(&Apath::from("/foo")).unwrap().apath, "/foo"); assert_eq!(it.next().unwrap().apath, "/foobar"); assert_eq!(it.next().unwrap().apath, "/g01"); // Advance to before /g01 - let mut it = IndexRead::open_path(&testdir.path()) - .iter_entries() - .unwrap(); + let mut it = IndexRead::open_path(&testdir.path()).iter_entries(); assert_eq!(it.advance_to(&Apath::from("/fxxx")), None); assert_eq!(it.next().unwrap().apath, "/g01"); assert_eq!(it.next().unwrap().apath, "/g02"); // Advance to before the first entry - let mut it = IndexRead::open_path(&testdir.path()) - .iter_entries() - .unwrap(); + let mut it = IndexRead::open_path(&testdir.path()).iter_entries(); assert_eq!(it.advance_to(&Apath::from("/aaaa")), None); assert_eq!(it.next().unwrap().apath, "/bar"); assert_eq!(it.next().unwrap().apath, "/foo"); // Advance to after the last entry - let mut it = IndexRead::open_path(&testdir.path()) - .iter_entries() - .unwrap(); + let mut it = IndexRead::open_path(&testdir.path()).iter_entries(); assert_eq!(it.advance_to(&Apath::from("/zz")), None); assert_eq!(it.next(), None); } diff --git a/src/lib.rs b/src/lib.rs index b9ca85fc..56f0309b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -68,6 +68,7 @@ pub use crate::progress::ProgressBar; pub use crate::restore::{RestoreOptions, RestoreTree}; pub use crate::stats::{CopyStats, DeleteStats, ValidateStats}; pub use crate::stored_tree::StoredTree; +pub use crate::transport::Transport; pub use crate::tree::{ReadBlocks, ReadTree, TreeSize, WriteTree}; // Commonly-used external types. 
diff --git a/src/output.rs b/src/output.rs
index 43477bbb..a2a804a5 100644
--- a/src/output.rs
+++ b/src/output.rs
@@ -89,7 +89,7 @@ pub fn show_verbose_version_list(
 pub fn show_index_json(band: &Band, w: &mut dyn Write) -> Result<()> {
     // TODO: Maybe use https://docs.serde.rs/serde/ser/trait.Serializer.html#method.collect_seq.
     let bw = BufWriter::new(w);
-    let index_entries: Vec<IndexEntry> = band.iter_entries()?.collect();
+    let index_entries: Vec<IndexEntry> = band.iter_entries().collect();
     serde_json::ser::to_writer_pretty(bw, &index_entries)
         .map_err(|source| Error::SerializeIndex { source })
 }
diff --git a/src/stitch.rs b/src/stitch.rs
index 337be17f..80d1576d 100644
--- a/src/stitch.rs
+++ b/src/stitch.rs
@@ -28,6 +28,7 @@
 //! seen.
 //! * Bands might be deleted, so their numbers are not contiguous.
 
+use crate::index::IndexEntryIter;
 use crate::*;
 
 pub struct IterStitchedIndexHunks {
@@ -52,6 +53,10 @@ impl IterStitchedIndexHunks {
             index_hunks: None,
         }
     }
+
+    pub fn iter_entries(self) -> IndexEntryIter<IterStitchedIndexHunks> {
+        IndexEntryIter::new(self)
+    }
 }
 
 impl Iterator for IterStitchedIndexHunks {
diff --git a/src/test_fixtures.rs b/src/test_fixtures.rs
index a0b93a3f..0f2abf6a 100644
--- a/src/test_fixtures.rs
+++ b/src/test_fixtures.rs
@@ -70,6 +70,10 @@ impl ScratchArchive {
         srcdir.create_file("hello2");
         backup(&self.archive, &srcdir.live_tree(), &options).unwrap();
     }
+
+    pub fn transport(&self) -> &dyn Transport {
+        self.archive.transport()
+    }
 }
 
 impl Deref for ScratchArchive {
diff --git a/tests/backup.rs b/tests/backup.rs
index 16efbcd9..bd244e43 100644
--- a/tests/backup.rs
+++ b/tests/backup.rs
@@ -120,3 +120,59 @@ pub fn mixed_medium_small_files_two_hunks() {
     }
     assert_eq!(tree.iter_entries().unwrap().count(), 2000);
 }
+
+#[test]
+fn detect_unchanged_from_stitched_index() {
+    let af = ScratchArchive::new();
+    let srcdir = TreeFixture::new();
+    srcdir.create_file("a");
+    srcdir.create_file("b");
+    // Use small hunks for easier manipulation.
+    let stats = backup(
+        &af,
+        &srcdir.live_tree(),
+        &BackupOptions {
+            max_entries_per_hunk: 1,
+            ..Default::default()
+        },
+    )
+    .unwrap();
+    assert_eq!(stats.new_files, 2);
+    assert_eq!(stats.small_combined_files, 2);
+    assert_eq!(stats.index_builder_stats.index_hunks, 3);
+
+    // Make a second backup, with the first file changed.
+    srcdir.create_file_with_contents("a", b"new a contents");
+    let stats = backup(
+        &af,
+        &srcdir.live_tree(),
+        &BackupOptions {
+            max_entries_per_hunk: 1,
+            ..Default::default()
+        },
+    )
+    .unwrap();
+    assert_eq!(stats.unmodified_files, 1);
+    assert_eq!(stats.modified_files, 1);
+    assert_eq!(stats.index_builder_stats.index_hunks, 3);
+
+    // Delete the last hunk and reopen the last band.
+    af.transport().remove_file("b0001/BANDTAIL").unwrap();
+    af.transport()
+        .remove_file("b0001/i/00000/000000002")
+        .unwrap();
+
+    // The third backup should see nothing changed, by looking at the stitched
+    // index from both b0 and b1.
+    let stats = backup(
+        &af,
+        &srcdir.live_tree(),
+        &BackupOptions {
+            max_entries_per_hunk: 1,
+            ..Default::default()
+        },
+    )
+    .unwrap();
+    assert_eq!(stats.unmodified_files, 2, "both files are unmodified");
+    assert_eq!(stats.index_builder_stats.index_hunks, 3);
+}
diff --git a/tests/integration.rs b/tests/integration.rs
index fbb9705e..721b7c66 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -123,6 +123,7 @@ pub fn backup_more_excludes() {
     let options = BackupOptions {
         excludes,
         print_filenames: false,
+        ..Default::default()
     };
     let stats = backup(&af, &source, &options).expect("backup");
 
@@ -146,7 +147,7 @@ fn check_backup(af: &ScratchArchive) {
     let band = Band::open(&af, &band_ids[0]).unwrap();
     assert!(band.is_closed().unwrap());
 
-    let index_entries = band.iter_entries().unwrap().collect::<Vec<IndexEntry>>();
+    let index_entries = band.iter_entries().collect::<Vec<IndexEntry>>();
     assert_eq!(2, index_entries.len());
 
     let root_entry = &index_entries[0];
@@ -275,7 +276,7 @@ pub fn symlink() {
     let band = Band::open(&af, &band_ids[0]).unwrap();
     assert!(band.is_closed().unwrap());
 
-    let index_entries = band.iter_entries().unwrap().collect::<Vec<IndexEntry>>();
+    let index_entries = band.iter_entries().collect::<Vec<IndexEntry>>();
     assert_eq!(2, index_entries.len());
 
     let e2 = &index_entries[1];

From 4434dbe462863a88336fdbbe2836f21e6e3ddcfc Mon Sep 17 00:00:00 2001
From: Martin Pool
Date: Sat, 31 Oct 2020 18:36:42 -0700
Subject: [PATCH 30/32] News for combining and stitched basis.

---
 NEWS.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 39d83e51..d1d28bdc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -12,6 +12,12 @@
 - Reading a subtree of a local source is faster, because it no longer reads
   and discards the whole tree.
 
+- Small files (under 100kB) are combined into blocks, to allow better cross-file
+  compression and to produce fewer small file reads and writes.
+
+- Backups use a stitched index as the basis. As a result, Conserve is better
+  able to recognize unchanged files after an interrupted backup.
+ ## v0.6.8 2020-10-16 ### Features From bdfd5158ae4f8fd99e8b7c3a3a57a77d816bcfa7 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 31 Oct 2020 18:42:43 -0700 Subject: [PATCH 31/32] Add old_archive test for 0.6.9 --- testdata/archive/v0.6.9/minimal-1/CONSERVE | 1 + testdata/archive/v0.6.9/minimal-1/b0000/BANDHEAD | 1 + testdata/archive/v0.6.9/minimal-1/b0000/BANDTAIL | 1 + .../v0.6.9/minimal-1/b0000/i/00000/000000000 | Bin 0 -> 326 bytes ...3dfcecb3ed9969e41a329e07afe0fbb597251a789c3575 | 2 ++ tests/old_archives.rs | 2 +- 6 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 testdata/archive/v0.6.9/minimal-1/CONSERVE create mode 100644 testdata/archive/v0.6.9/minimal-1/b0000/BANDHEAD create mode 100644 testdata/archive/v0.6.9/minimal-1/b0000/BANDTAIL create mode 100644 testdata/archive/v0.6.9/minimal-1/b0000/i/00000/000000000 create mode 100644 testdata/archive/v0.6.9/minimal-1/d/ea5/ea50e43840e5f310490bba1b641db82480a05e16e9ae220c1e5113c79b59541fa5a6ddb13db20d4df53dfcecb3ed9969e41a329e07afe0fbb597251a789c3575 diff --git a/testdata/archive/v0.6.9/minimal-1/CONSERVE b/testdata/archive/v0.6.9/minimal-1/CONSERVE new file mode 100644 index 00000000..24d40a2d --- /dev/null +++ b/testdata/archive/v0.6.9/minimal-1/CONSERVE @@ -0,0 +1 @@ +{"conserve_archive_version":"0.6"} diff --git a/testdata/archive/v0.6.9/minimal-1/b0000/BANDHEAD b/testdata/archive/v0.6.9/minimal-1/b0000/BANDHEAD new file mode 100644 index 00000000..412eb149 --- /dev/null +++ b/testdata/archive/v0.6.9/minimal-1/b0000/BANDHEAD @@ -0,0 +1 @@ +{"start_time":1604194804,"band_format_version":"0.6.3"} diff --git a/testdata/archive/v0.6.9/minimal-1/b0000/BANDTAIL b/testdata/archive/v0.6.9/minimal-1/b0000/BANDTAIL new file mode 100644 index 00000000..bf3500d3 --- /dev/null +++ b/testdata/archive/v0.6.9/minimal-1/b0000/BANDTAIL @@ -0,0 +1 @@ +{"end_time":1604194804,"index_hunk_count":1} diff --git a/testdata/archive/v0.6.9/minimal-1/b0000/i/00000/000000000 b/testdata/archive/v0.6.9/minimal-1/b0000/i/00000/000000000 new file mode 100644 index 0000000000000000000000000000000000000000..638ced81f836516c6794298908c4d7ec0c7b84eb GIT binary patch literal 326 zcmYjMy-EW?5WWM;1Ole9usN_$0?zF2?A@k{AVD;Rpx8)|+1ojCc)3V&R+1;M7kmM0 zpTHNg@dXsuPMG5RhWUQxd*}W3DdPw37TGvE&PLgN-G&HH>vbf{u3kzuMhl7vK|z1_ z?5^c@wTVkr5I|ADcCR-Q2 b(qWE7%KS{yI5(4Yu<3Z+ohbO{vYr0|VBTBO literal 0 HcmV?d00001 diff --git a/testdata/archive/v0.6.9/minimal-1/d/ea5/ea50e43840e5f310490bba1b641db82480a05e16e9ae220c1e5113c79b59541fa5a6ddb13db20d4df53dfcecb3ed9969e41a329e07afe0fbb597251a789c3575 b/testdata/archive/v0.6.9/minimal-1/d/ea5/ea50e43840e5f310490bba1b641db82480a05e16e9ae220c1e5113c79b59541fa5a6ddb13db20d4df53dfcecb3ed9969e41a329e07afe0fbb597251a789c3575 new file mode 100644 index 00000000..1a2b118f --- /dev/null +++ b/testdata/archive/v0.6.9/minimal-1/d/ea5/ea50e43840e5f310490bba1b641db82480a05e16e9ae220c1e5113c79b59541fa5a6ddb13db20d4df53dfcecb3ed9969e41a329e07afe0fbb597251a789c3575 @@ -0,0 +1,2 @@ +\hello world +I like Rust diff --git a/tests/old_archives.rs b/tests/old_archives.rs index 086d144a..ecbd37ef 100644 --- a/tests/old_archives.rs +++ b/tests/old_archives.rs @@ -23,7 +23,7 @@ use predicates::prelude::*; use conserve::*; -const ARCHIVE_VERSIONS: &[&str] = &["0.6.0", "0.6.2", "0.6.3"]; +const ARCHIVE_VERSIONS: &[&str] = &["0.6.0", "0.6.2", "0.6.3", "0.6.9"]; fn open_old_archive(ver: &str, name: &str) -> Archive { Archive::open_path(&Path::new(&format!("testdata/archive/v{}/{}/", ver, name))) From 0d78bd97ef79cbc34fca6b9d2aaa1c9821d5f628 Mon Sep 17 
00:00:00 2001
From: Martin Pool
Date: Sat, 31 Oct 2020 18:51:28 -0700
Subject: [PATCH 32/32] Better file counts in backup stats

---
 src/stats.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/stats.rs b/src/stats.rs
index 01674373..b7ec421b 100644
--- a/src/stats.rs
+++ b/src/stats.rs
@@ -201,12 +201,17 @@ impl CopyStats {
         write_count(w, " unmodified files", self.unmodified_files);
         write_count(w, " modified files", self.modified_files);
         write_count(w, " new files", self.new_files);
-        write_count(w, " small combined files", self.small_combined_files);
         write_count(w, "symlinks", self.symlinks);
         write_count(w, "directories", self.directories);
         write_count(w, "unsupported file kind", self.unknown_kind);
         writeln!(w).unwrap();
 
+        write_count(w, "empty files", self.empty_files);
+        write_count(w, "small combined files", self.small_combined_files);
+        write_count(w, "single block files", self.single_block_files);
+        write_count(w, "multi-block files", self.multi_block_files);
+        writeln!(w).unwrap();
+
         write_count(w, "deduplicated data blocks:", self.deduplicated_blocks);
         write_size(w, " saved", self.deduplicated_bytes);
         writeln!(w).unwrap();
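
After patch 29 in this series, a backup is driven through the free function `backup` plus `BackupOptions`, rather than a method on `Archive`. Below is a minimal sketch of a caller, written only against what the patches themselves show; the module path `conserve::test_fixtures` and the `conserve::Result` alias are assumptions not visible in the diffs above.

// Sketch only: exercises the post-patch-29 backup API the same way
// tests/backup.rs does. `conserve::test_fixtures` and `conserve::Result`
// are assumed names; everything else appears in the patches above.
use conserve::test_fixtures::{ScratchArchive, TreeFixture};
use conserve::{backup, BackupOptions};

fn run_small_hunk_backup() -> conserve::Result<()> {
    let af = ScratchArchive::new(); // temporary archive; Derefs to Archive
    let srcdir = TreeFixture::new(); // temporary source tree
    srcdir.create_file("a");
    srcdir.create_file("b");

    // One entry per index hunk, as in detect_unchanged_from_stitched_index.
    let options = BackupOptions {
        max_entries_per_hunk: 1,
        ..Default::default()
    };
    let stats = backup(&af, &srcdir.live_tree(), &options)?;
    assert_eq!(stats.new_files, 2);
    Ok(())
}

This is the same shape the new detect_unchanged_from_stitched_index test uses; lowering max_entries_per_hunk to 1 simply forces several index hunks so the stitched-basis behaviour becomes observable.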