
feat: golden-model dedup command
0xCCF4 committed Nov 30, 2024
1 parent 00be903 commit b5f1b34
Showing 10 changed files with 289 additions and 71 deletions.
1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions Cargo.toml
@@ -34,6 +34,7 @@ tar = { version = "0.4.40", optional = true }
zip = { git = "https://github.com/0xCCF4/zip2", branch = "feature-read-from-seekable-stream", optional = true, features = ["chrono"] }
chrono = { version = "0.4.38", optional = true }
rand = "0.8.5"
regex = "1.10.5"

[features]
hash-sha1 = ["dep:sha1"]
16 changes: 16 additions & 0 deletions src/data/path.rs
@@ -144,6 +144,22 @@ impl FilePath {
result
}

/// Gets the first component of the file path.
///
/// # Returns
/// The first component of the file path. None if the file path is empty.
pub fn first_component(&self) -> Option<&PathBuf> {
self.path.first()
}

/// Gets a mutable reference to the first component of the file path.
///
/// # Returns
/// A mutable reference to the first component of the file path. None if the file path is empty.
pub fn first_component_mut(&mut self) -> Option<&mut PathBuf> {
self.path.first_mut()
}

/// Gets the last component of the file path.
///
/// # Returns
59 changes: 32 additions & 27 deletions src/lib.md
@@ -12,19 +12,19 @@ The tool is run in four stages:
│ Folder ├──┘ └────┬┬────┘ └┬────────────────┘
│ -file │ ││ │
│ -file │ ┌───────┼┼────────┘
└───┬──────┘ │ ││
│ ┌────▼▼────┐ ┌─────────────────┐
│ │ │ │ │
└──► Analyze ├──► Duplicate Sets │
│ │ │ │
└────┬┬────┘ └┬────────────────┘
││ │ Basic functionality complete
----│----│----┌───────┼┼────────┘----------------------------------
│ ││ Implementation in progress
│ ┌────▼▼────┐ ┌─────────────────┐
└──► │ │ │
Dedup ├──► Change commands │
└───────► │ │ │
└───┬──────┘ │ ││
│ ┌────▼▼────┐ ┌─────────────────┐
│ │ │ │ │
└──► Analyze ├──► Duplicate Sets │
│ │ │ │
└────┬┬────┘ └┬────────────────┘
││ │
┌───────┼┼────────┘
│ ││
│ ┌────▼▼────┐ ┌─────────────────┐
│ | │ │ │
└──► Dedup ├──► Change commands │
| │ │ │
│ └────┬┬────┘ └┬────────────────┘
│ ││ │
│ ┌───────┼┼────────┘
@@ -38,11 +38,8 @@ The tool is run in four stages:
1. **Build**: The tool reads a folder and builds a hash tree of all files in it.
2. **Analyze**: The tool analyzes the hash tree and finds duplicate files.
3. **Dedup**: The tool determines which steps to take to deduplicate the files.
This can be done in a semi-automatic or manual way.
4. **Execute**: The tool executes the deduplication steps (Deleting/Hardlinking/...).

**Dedup** and **Execute** are in development and currently not (fully) implemented.

## Build
* Input: Folder with files, Hashtree (optional) to update or continue from.
* Output: HashTree
@@ -79,28 +76,36 @@ The `clean` command can also be run manually.
single-threaded duplication detection.

### Analysis results
The analysis results are stored in a file with the following format:
The analysis results are stored in a file with the following JSON format:
```plain
[ENTRY] [newline]
[ENTRY] [newline]
...
{
"version": "V1",
"entries": [
ENTRY,
ENTRY,
...
]
}
```
See `ResultEntry` for the exact format of an entry. In short, it contains (JSON)


See `DupSetEntry` for the exact format of an entry. In short, each entry contains (as JSON; see the sketch after this list):
* File type
* Hash
* Size (0 if it is a directory, else the file size of one of the files)
* Size (if it is a directory: number of children, else the file size of one of the files)
* Conflicting Set (a set of all files that are duplicates of each other)
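
For illustration, a single entry might look roughly like this. The field names (`ftype`, `size`, `hash`, `conflicting`) follow `DupSetEntry` as added in this commit, but the concrete value formats and the paths are invented for the example:

```plain
{
  "ftype": "File",
  "size": 1048576,
  "hash": "sha256:3a7bd3e2...",
  "conflicting": [
    "/backups/2024/photos/img_0001.jpg",
    "/backups/2023/photos/img_0001.jpg"
  ]
}
```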

## Dedup
* Input: Duplicate sets
* Output: Set of commands to execute to deduplicate the files
* Execution: Manual or half-automatic, user interaction required.
* Output: Required actions to deduplicate the files
* Execution: Fully automatic, no user interaction required.

Implementation in progress. As of now, the duplicate sets
must be processed manually.
Currently, there is just one deduplication strategy implemented:
* **golden model**: delete all files outside of the "golden" directory that are also contained
within the golden directory (see the sketch below).
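
A minimal sketch of the selection logic, assuming a duplicate set is available as a plain list of absolute paths. The type and function names here are illustrative, not the crate's API, and the real command additionally restricts deletions to the directories passed on the command line:

```rust
use std::path::{Path, PathBuf};

/// For one set of duplicate files, keep every copy inside the golden
/// (reference) directory and mark the copies outside it for deletion.
/// If the set has no copy inside the golden directory, nothing is marked.
fn plan_deletions(duplicates: &[PathBuf], golden_dir: &Path) -> Vec<PathBuf> {
    let has_golden_copy = duplicates.iter().any(|p| p.starts_with(golden_dir));
    if !has_golden_copy {
        return Vec::new(); // no reference copy, so delete nothing
    }
    duplicates
        .iter()
        .filter(|p| !p.starts_with(golden_dir))
        .cloned()
        .collect()
}

fn main() {
    let set = vec![
        PathBuf::from("/backups/2024/photo.jpg"), // inside the golden directory
        PathBuf::from("/backups/2023/photo.jpg"), // duplicate in an older backup
    ];
    let to_delete = plan_deletions(&set, Path::new("/backups/2024"));
    assert_eq!(to_delete, vec![PathBuf::from("/backups/2023/photo.jpg")]);
}
```

Note that `Path::starts_with` compares whole path components, so a sibling directory such as `/backups/2024-old` would not count as being inside `/backups/2024`.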

## Execute
* Input: Set of commands
* Input: Set of dedup actions
* Output: Deduplicated files
* Execution: Fully automatic, user interaction only on errors.

35 changes: 25 additions & 10 deletions src/main.rs
@@ -2,14 +2,16 @@ use backup_deduplicator::hash::GeneralHashType;
use backup_deduplicator::stages::analyze::cmd::AnalysisSettings;
use backup_deduplicator::stages::build::cmd::BuildSettings;
use backup_deduplicator::stages::clean::cmd::CleanSettings;
use backup_deduplicator::stages::dedup::golden_model::cmd::{
DedupGoldenModelSettings, MatchingModel,
};
use backup_deduplicator::stages::{analyze, build, clean, dedup};
use backup_deduplicator::utils;
use clap::{arg, Parser, Subcommand};
use log::{debug, info, trace, LevelFilter};
use std::env;
use std::path::PathBuf;
use std::str::FromStr;
use backup_deduplicator::stages::dedup::golden_model::cmd::DedupGoldenModelSettings;

/// A simple command line tool to deduplicate backups.
#[derive(Parser, Debug)]
@@ -105,13 +107,13 @@ enum Command {
/// The output actions file to write the actions to.
#[arg(short, long, default_value = "actions.bdc")]
output: String,
/// Overwrite the output file, if set it already exists
/// Overwrite the output file if it already exists
#[arg(long = "overwrite", default_value = "false")]
overwrite: bool,
/// Deduplication mode and settings
#[command(subcommand)]
mode: DedupMode,
}
},
}

#[derive(Subcommand, Debug)]
@@ -120,17 +122,20 @@ enum DedupMode {
/// Files from within the reference model are not altered. A list of other directories
/// can be given; from within those directories all files that have a duplicate in the reference model
/// are marked for deletion.
///
///
/// This mode is useful when you have multiple backups of the same data and would like to quickly
/// remove files from older backups that are also present in the newer one.
GoldenModel {
/// The reference model directory
#[arg(short, long)]
reference_model: String,
/// The matching model to use for deduplication.
#[arg(short, long, default_value = "plain")]
matching_model: MatchingModel,
/// The directories to delete files from.
#[arg(short, long)]
directories: Vec<String>,
}
},
}

fn main() {
@@ -191,10 +196,15 @@ fn main() {

// Convert to paths and check if they exist

let directory = directory.into_iter().map(|directory| utils::main::parse_path(
directory.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
)).collect::<Vec<PathBuf>>();
let directory = directory
.into_iter()
.map(|directory| {
utils::main::parse_path(
directory.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
)
})
.collect::<Vec<PathBuf>>();
let output = utils::main::parse_path(
output.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
@@ -410,11 +420,16 @@
}

match mode {
DedupMode::GoldenModel { reference_model, directories } => {
DedupMode::GoldenModel {
reference_model,
matching_model,
directories,
} => {
match dedup::golden_model::cmd::run(DedupGoldenModelSettings {
input,
output,
reference_model,
matching_model,
directories,
}) {
Ok(_) => {
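
For context, a hypothetical invocation of the subcommand defined above might look as follows. The binary name and flag spellings are assumed from the clap attributes in this diff rather than taken from project documentation, and the duplicate-set input argument defined elsewhere in the `dedup` command is omitted:

```plain
backup-deduplicator dedup \
    --output actions.bdc \
    golden-model \
    --reference-model /backups/2024 \
    --matching-model plain \
    --directories /backups/2023 \
    --directories /backups/2022
```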
18 changes: 9 additions & 9 deletions src/stages/analyze/cmd.rs
@@ -185,31 +185,31 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> {
"There are {} GB of duplicated files",
duplicated_bytes / 1024 / 1024 / 1024
);

drop(output_buf_writer);

let output_file_reader = match input_file_options.open(&analysis_settings.output) {
Ok(file) => file,
Err(err) => {
return Err(anyhow!("Failed to open output file readable: {}", err));
}
};

let mut output_buf_reader = std::io::BufReader::new(&output_file_reader);
let mut text = String::new();

if let Err(err) = output_buf_reader.read_to_string(&mut text) {
return Err(anyhow!("Failed to read output file: {}", err));
}

drop(output_buf_reader);

let mut result = DupSetFile {
version: DupSetFileVersion::V1,
entries: Vec::new(),
};
for line in text.lines() {
let entry = match serde_json::from_str(&line) {
let entry = match serde_json::from_str(line) {
Ok(entry) => entry,
Err(err) => {
error!("Failed to parse line in output: {}", err);
@@ -226,11 +226,11 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> {
}
};
let mut output_buf_writer = std::io::BufWriter::new(&output_file);

if let Err(err) = serde_json::to_writer(&mut output_buf_writer, &result) {
return Err(anyhow!("Failed to write output file: {}", err));
}

if let Err(err) = output_buf_writer.flush() {
return Err(anyhow!("Failed to flush output file: {}", err));
}
23 changes: 18 additions & 5 deletions src/stages/analyze/output/dupset_file.rs
@@ -47,7 +47,12 @@ impl From<&DupSetEntryRef<'_, '_, '_>> for DupSetEntry {
ftype: *entry.ftype,
size: entry.size,
hash: entry.hash.clone(),
conflicting: entry.conflicting.clone().into_iter().cloned().collect::<Vec<FilePath>>(),
conflicting: entry
.conflicting
.clone()
.into_iter()
.cloned()
.collect::<Vec<FilePath>>(),
}
}
}
@@ -60,32 +65,40 @@ pub enum DupSetFileVersion {
}

/// Deduplication set file.
///
///
/// # Fields
/// * `version` - The version of the file format.
/// * `entries` - The deduplication set entries.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DupSetFile {
/// The version of the file format.
pub version: DupSetFileVersion,
/// The deduplication set entries.
pub entries: Vec<DupSetEntry>,
}

/// Deduplication set file. (Reference version)
///
///
/// # Fields
/// * `version` - The version of the file format.
/// * `entries` - The deduplication set entries.
#[derive(Debug, Serialize)]
pub struct DupSetFileRef<'a, 'b, 'c> {
/// The version of the file format.
pub version: DupSetFileVersion,
/// The deduplication set entries.
pub entries: Vec<DupSetEntryRef<'a, 'b, 'c>>,
}

impl From<&DupSetFileRef<'_, '_, '_>> for DupSetFile {
fn from(value: &DupSetFileRef<'_, '_, '_>) -> Self {
DupSetFile {
version: value.version,
entries: value.entries.iter().map(DupSetEntry::from).collect::<Vec<DupSetEntry>>(),
entries: value
.entries
.iter()
.map(DupSetEntry::from)
.collect::<Vec<DupSetEntry>>(),
}
}
}
}
1 change: 0 additions & 1 deletion src/stages/dedup.rs
@@ -1,4 +1,3 @@

/// Contains the output data structures for the dedup stage.
pub mod output {
mod actions;