wip: added ability to only run quality metrics

nishaq503 · Jan 23, 2025 · d8aada9 · d8aada9
1 parent e3dd7b0
commit d8aada9
Show file tree

Hide file tree

Showing 16 changed files with 491 additions and 581 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,10 +6,10 @@ members = [
     "crates/results/cakes",
     "crates/results/chaoda",
     "crates/results/rite-solutions",
-    "crates/results/msa",
     "pypi/distances",
     "benches/utils",
     "benches/cakes",
+    "benches/musals",
 ]
 resolver = "2"
 
@@ -40,6 +40,8 @@ clap = { version = "4.5", features = ["derive"] }
 ftlog = { version = "0.2" }
 # For reading and writing HDF5 files.
 hdf5 = { package = "hdf5-metno", version = "0.9.0" }
+# For some MSA schenanigans.
+bytecount = "0.6.8"
 
 # For Python Wrappers
 numpy = "0.23"

diff --git a/benches/cakes/src/README.md → benches/cakes/README.md b/benches/cakes/src/README.md → benches/cakes/README.md
diff --git a/benches/cakes/src/main.rs b/benches/cakes/src/main.rs
@@ -8,13 +8,12 @@
     clippy::perf,
     clippy::pedantic,
     clippy::nursery,
-    clippy::missing_docs_in_private_items,
     clippy::unwrap_used,
     clippy::expect_used,
     clippy::panic,
     clippy::cast_lossless
 )]
-//! Benchmarks for the CAKES paper.
+#![doc = include_str!("../README.md")]
 
 use std::path::PathBuf;
 

diff --git a/benches/musals/Cargo.toml b/benches/musals/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "bench-musals"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+clap = { version = "4.5.16", features = ["derive"] }
+bench-utils = { path = "../utils" }
+ftlog = { workspace = true }
+abd-clam = { workspace = true, features = ["disk-io"] }
+distances = { workspace = true }
+rand = { workspace = true }
+rayon = { workspace = true }
diff --git a/benches/musals/README.md b/benches/musals/README.md
@@ -0,0 +1,34 @@
+# Benchmarks for CLAM-MUSALS
+
+This is crate provides a CLI to run benchmarks for the MUSALS algorithm for building multiple sequence alignments (MSAs) on biological sequences.
+
+## Usage
+
+Run the following command to see the usage information:
+
+```shell
+cargo run -rp bench-musals -- --help
+```
+
+If you want to build the MSA on all a sequences in a fasta file, you can use the following command:
+
+```shell
+cargo run -rp bench-musals -- \
+    -i ../data/string-data/greengenes/gg_13_5.fasta \
+    -o ../data/string-data/greengenes/musals-results \
+    -m extended-iupac
+```
+
+If you want to build the MSA on a subset of the sequences in a fasta file, you can use use the optional `-n` flag to specify the number of sequences to use:
+
+```shell
+cargo run -rp bench-musals -- \
+    -i ../data/string-data/greengenes/gg_13_5.fasta \
+    -o ../data/string-data/greengenes/musals-results \
+    -m extended-iupac \
+    -n 1000
+```
+
+## Citation
+
+TODO...
diff --git a/benches/musals/src/main.rs b/benches/musals/src/main.rs
@@ -0,0 +1,241 @@
+#![deny(clippy::correctness)]
+#![warn(
+    missing_docs,
+    clippy::all,
+    clippy::suspicious,
+    clippy::style,
+    clippy::complexity,
+    clippy::perf,
+    clippy::pedantic,
+    clippy::nursery,
+    clippy::unwrap_used,
+    clippy::expect_used,
+    clippy::panic,
+    clippy::cast_lossless
+)]
+#![doc = include_str!("../README.md")]
+
+use std::path::PathBuf;
+
+use abd_clam::{metric::Levenshtein, msa, Cluster, Dataset};
+use bench_utils::configure_logger;
+use clap::Parser;
+use distances::Number;
+
+mod path_manager;
+mod steps;
+
+/// Reproducible results for the MSA paper.
+#[derive(Parser, Debug)]
+#[command(version, about, long_about = None)]
+struct Args {
+    /// Path to the input fasta file.
+    #[arg(short('i'), long)]
+    inp_path: PathBuf,
+
+    /// The number of samples to take from the input dataset. If not provided,
+    /// the entire dataset is used.
+    #[arg(short('n'), long)]
+    num_samples: Option<usize>,
+
+    /// The cost matrix to use for the alignment.
+    #[arg(short('m'), long)]
+    cost_matrix: CostMatrix,
+
+    /// Optional cost of opening a gap, if using an affine gap penalty. If not
+    /// provided, a flat gap penalty is used.
+    #[arg(short('g'), long)]
+    gap_open: Option<usize>,
+
+    /// Whether to build a balanced tree.
+    #[arg(short('b'), long)]
+    balanced: bool,
+
+    /// Path to the output directory.
+    #[arg(short('o'), long)]
+    out_dir: Option<PathBuf>,
+
+    /// Whether to only compute the quality metrics.
+    #[arg(short('q'), long)]
+    quality_only: bool,
+}
+
+/// The cost matrix to use for the alignment.
+#[derive(clap::ValueEnum, Debug, Clone)]
+#[allow(non_camel_case_types, clippy::doc_markdown)]
+#[non_exhaustive]
+pub enum CostMatrix {
+    /// The default matrix. All substitutions have a cost of 1.
+    #[clap(name = "default")]
+    Default,
+    /// Default but with affine gap penalties. Gap opening is 10 and ext is 1.
+    #[clap(name = "default-affine")]
+    DefaultAffine,
+    /// Extended IUPAC matrix.
+    #[clap(name = "extended-iupac")]
+    ExtendedIupac,
+    /// Blosum62 matrix.
+    #[clap(name = "blosum62")]
+    Blosum62,
+}
+
+impl CostMatrix {
+    /// Get the cost matrix.
+    #[must_use]
+    pub fn cost_matrix<T: Number + core::ops::Neg<Output = T>>(&self, gap_open: Option<usize>) -> msa::CostMatrix<T> {
+        match self {
+            Self::Default => msa::CostMatrix::default(),
+            Self::DefaultAffine => msa::CostMatrix::default_affine(gap_open),
+            Self::ExtendedIupac => msa::CostMatrix::extended_iupac(gap_open),
+            Self::Blosum62 => msa::CostMatrix::blosum62(gap_open),
+        }
+    }
+}
+
+#[allow(clippy::similar_names, clippy::cognitive_complexity, clippy::too_many_lines)]
+fn main() -> Result<(), String> {
+    let args = Args::parse();
+    println!("{args:?}");
+
+    let fasta_name = args
+        .inp_path
+        .file_stem()
+        .ok_or("No file name found")?
+        .to_string_lossy()
+        .to_string();
+    let log_name = format!("musals-{fasta_name}");
+    // We need the `_guard` in scope to ensure proper logging.
+    let (_guard, log_path) = configure_logger(&log_name)?;
+    println!("Log file: {log_path:?}");
+    ftlog::info!("{args:?}");
+
+    let cost_matrix = args.cost_matrix.cost_matrix::<i32>(args.gap_open);
+    let aligner = msa::Aligner::new(&cost_matrix, b'-');
+
+    let out_dir = if let Some(out_dir) = args.out_dir {
+        if !out_dir.exists() {
+            std::fs::create_dir(&out_dir).map_err(|e| e.to_string())?;
+        }
+        out_dir
+    } else {
+        args.inp_path.parent().ok_or("No parent directory found")?.to_path_buf()
+    };
+    ftlog::info!("Input file: {:?}", args.inp_path);
+    ftlog::info!("Output directory: {out_dir:?}");
+
+    let (data, _) = bench_utils::fasta::read(&args.inp_path, 0, true)?;
+    let msa_data = if args.quality_only {
+        steps::read_aligned(&args.inp_path, &aligner)?
+    } else {
+        let data = if let Some(num_samples) = args.num_samples {
+            ftlog::info!("Sub-sampling dataset to {} samples.", num_samples);
+            let data = data.random_subsample(&mut rand::thread_rng(), num_samples);
+            let seq_lens = data.items().iter().map(String::len).collect::<Vec<_>>();
+            let (min_len, max_len, _, _, _) = bench_utils::fasta::len_stats(&seq_lens);
+            let name = format!("{}-{num_samples}", data.name());
+            let data = data
+                .with_dim_lower_bound(min_len)
+                .with_dim_upper_bound(max_len)
+                .with_name(&name);
+            ftlog::info!("Sub-sampled dataset: length range = {:?}.", data.dimensionality_hint());
+            data
+        } else {
+            data
+        };
+        let str_to_seq = |s: String| msa::Sequence::new(s, Some(&aligner));
+        let data = data.transform_items(str_to_seq);
+        ftlog::info!(
+            "Finished reading original dataset: length range = {:?}",
+            data.dimensionality_hint()
+        );
+
+        let path_manager = path_manager::PathManager::new(data.name(), &out_dir);
+
+        let metric = Levenshtein;
+
+        let msa_fasta_path = path_manager.msa_fasta_path();
+        if !msa_fasta_path.exists() {
+            // Build the MSA.
+            let start = std::time::Instant::now();
+            let msa_ball_path = path_manager.msa_ball_path();
+            let msa_data_path = path_manager.msa_data_path();
+
+            let (perm_ball, perm_data) = if msa_ball_path.exists() && msa_data_path.exists() {
+                // Read the PermutedBall and the dataset.
+                steps::read_permuted_ball(&msa_ball_path, &msa_data_path, &aligner)?
+            } else {
+                let ball_path = path_manager.ball_path();
+                let ball = if ball_path.exists() {
+                    // Read the Ball.
+                    steps::read_ball(&ball_path)?
+                } else {
+                    steps::build_ball(&data, &metric, &ball_path, &path_manager.ball_csv_path(), args.balanced)?
+                };
+                ftlog::info!("Ball has {} leaves.", ball.leaves().len());
+
+                // Build the PermutedBall and the dataset.
+                steps::build_perm_ball(ball, data, &metric, &msa_ball_path, &msa_data_path)?
+            };
+            ftlog::info!("Permuted Ball has {} leaves.", perm_ball.leaves().len());
+
+            // Build the MSA.
+            steps::build_aligned(
+                &args.cost_matrix,
+                args.gap_open,
+                &perm_ball,
+                &perm_data,
+                &msa_fasta_path,
+            )?;
+            let elapsed = start.elapsed().as_secs_f32();
+            let msa_build_msg = format!("Finished building MSA in {elapsed:.2} seconds.");
+            ftlog::info!("{msa_build_msg}");
+            println!("{msa_build_msg}");
+        };
+
+        // Read the aligned sequences and load the data.
+        ftlog::info!("Reading aligned sequences from: {msa_fasta_path:?}");
+        steps::read_aligned(&msa_fasta_path, &aligner)?
+    };
+
+    ftlog::info!(
+        "Finished reading {} aligned sequences with width = {:?}.",
+        msa_data.cardinality(),
+        msa_data.dimensionality_hint()
+    );
+
+    // Compute the quality metrics.
+    let start = std::time::Instant::now();
+    let gap_penalty = 1;
+    let mismatch_penalty = 1;
+    let gap_open_penalty = 10;
+    let gap_ext_penalty = 1;
+
+    let pg_quality = msa_data.percent_gaps();
+    ftlog::info!("Percent gaps: {pg_quality:.4}");
+
+    let ps_quality = msa_data.par_scoring_pairwise_subsample(gap_penalty, mismatch_penalty);
+    ftlog::info!("Pairwise scoring metric estimate: {ps_quality}");
+
+    let wps_quality =
+        msa_data.par_weighted_scoring_pairwise_subsample(gap_open_penalty, gap_ext_penalty, mismatch_penalty);
+    ftlog::info!("Weighted pairwise scoring metric estimate: {wps_quality}");
+
+    let (avg_p, max_p) = msa_data.par_p_distance_stats_subsample();
+    ftlog::info!("Pairwise distance stats estimate: avg = {avg_p:.4}, max = {max_p:.4}");
+
+    let dd_quality = msa_data.par_distance_distortion_subsample();
+    ftlog::info!("Distance distortion metric estimate: {dd_quality}");
+
+    ftlog::info!("Converting to column-major format.");
+    let col_msa_data = msa_data.par_change_major();
+    ftlog::info!("Finished converting to column-major format.");
+
+    let cs_quality = col_msa_data.par_scoring_columns(gap_penalty, mismatch_penalty);
+    ftlog::info!("Column scoring metric: {cs_quality}");
+
+    let elapsed = start.elapsed().as_secs_f32();
+    ftlog::info!("Quality metrics took {elapsed:.2} seconds.");
+    println!("Quality metrics took {elapsed:.2} seconds.");
+
+    Ok(())
+}
diff --git a/benches/musals/src/path_manager.rs b/benches/musals/src/path_manager.rs
@@ -0,0 +1,67 @@
+//! A helper for managing the names of files on disk.
+
+use std::path::{Path, PathBuf};
+
+/// A helper for managing the names of files on disk.
+pub struct PathManager {
+    name: String,
+    out_dir: PathBuf,
+}
+
+impl PathManager {
+    /// Creates a new `PathManager`.
+    #[must_use]
+    pub fn new<P: AsRef<Path>>(name: &str, out_dir: &P) -> Self {
+        Self {
+            name: name.to_string(),
+            out_dir: out_dir.as_ref().to_path_buf(),
+        }
+    }
+
+    /// The name of the dataset.
+    #[must_use]
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// The directory where the dataset is stored.
+    #[must_use]
+    pub fn out_dir(&self) -> &Path {
+        &self.out_dir
+    }
+
+    /// The path to the binary file containing the `Ball` tree.
+    #[must_use]
+    pub fn ball_path(&self) -> PathBuf {
+        let name = format!("{}.ball", self.name());
+        self.out_dir().join(name)
+    }
+
+    /// The path to the CSV file containing the `Ball` tree.
+    #[must_use]
+    pub fn ball_csv_path(&self) -> PathBuf {
+        let name = format!("{}_ball.csv", self.name());
+        self.out_dir().join(name)
+    }
+
+    /// Path to file containing tree used toe making an MSA.
+    #[must_use]
+    pub fn msa_ball_path(&self) -> PathBuf {
+        let name = format!("{}_ball.msa", self.name());
+        self.out_dir().join(name)
+    }
+
+    /// Path to file containing containing the MSA of the dataset.
+    #[must_use]
+    pub fn msa_data_path(&self) -> PathBuf {
+        let name = format!("{}.msa", self.name());
+        self.out_dir().join(name)
+    }
+
+    /// Path to file containing the MSA of the dataset in FASTA format.
+    #[must_use]
+    pub fn msa_fasta_path(&self) -> PathBuf {
+        let name = format!("{}_msa.fasta", self.name());
+        self.out_dir().join(name)
+    }
+}