Skip to content

Commit

Permalink
wip: added ability to only run quality metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
nishaq503 committed Jan 23, 2025
1 parent e3dd7b0 commit d8aada9
Show file tree
Hide file tree
Showing 16 changed files with 491 additions and 581 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ members = [
"crates/results/cakes",
"crates/results/chaoda",
"crates/results/rite-solutions",
"crates/results/msa",
"pypi/distances",
"benches/utils",
"benches/cakes",
"benches/musals",
]
resolver = "2"

Expand Down Expand Up @@ -40,6 +40,8 @@ clap = { version = "4.5", features = ["derive"] }
ftlog = { version = "0.2" }
# For reading and writing HDF5 files.
hdf5 = { package = "hdf5-metno", version = "0.9.0" }
# For some MSA schenanigans.
bytecount = "0.6.8"

# For Python Wrappers
numpy = "0.23"
Expand Down
File renamed without changes.
3 changes: 1 addition & 2 deletions benches/cakes/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
clippy::perf,
clippy::pedantic,
clippy::nursery,
clippy::missing_docs_in_private_items,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::cast_lossless
)]
//! Benchmarks for the CAKES paper.
#![doc = include_str!("../README.md")]

use std::path::PathBuf;

Expand Down
13 changes: 13 additions & 0 deletions benches/musals/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "bench-musals"
version = "0.1.0"
edition = "2021"

[dependencies]
clap = { version = "4.5.16", features = ["derive"] }
bench-utils = { path = "../utils" }
ftlog = { workspace = true }
abd-clam = { workspace = true, features = ["disk-io"] }
distances = { workspace = true }
rand = { workspace = true }
rayon = { workspace = true }
34 changes: 34 additions & 0 deletions benches/musals/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Benchmarks for CLAM-MUSALS

This is crate provides a CLI to run benchmarks for the MUSALS algorithm for building multiple sequence alignments (MSAs) on biological sequences.

## Usage

Run the following command to see the usage information:

```shell
cargo run -rp bench-musals -- --help
```

If you want to build the MSA on all a sequences in a fasta file, you can use the following command:

```shell
cargo run -rp bench-musals -- \
-i ../data/string-data/greengenes/gg_13_5.fasta \
-o ../data/string-data/greengenes/musals-results \
-m extended-iupac
```

If you want to build the MSA on a subset of the sequences in a fasta file, you can use use the optional `-n` flag to specify the number of sequences to use:

```shell
cargo run -rp bench-musals -- \
-i ../data/string-data/greengenes/gg_13_5.fasta \
-o ../data/string-data/greengenes/musals-results \
-m extended-iupac \
-n 1000
```

## Citation

TODO...
241 changes: 241 additions & 0 deletions benches/musals/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
#![deny(clippy::correctness)]
#![warn(
missing_docs,
clippy::all,
clippy::suspicious,
clippy::style,
clippy::complexity,
clippy::perf,
clippy::pedantic,
clippy::nursery,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::cast_lossless
)]
#![doc = include_str!("../README.md")]

use std::path::PathBuf;

use abd_clam::{metric::Levenshtein, msa, Cluster, Dataset};
use bench_utils::configure_logger;
use clap::Parser;
use distances::Number;

mod path_manager;
mod steps;

/// Reproducible results for the MSA paper.
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
/// Path to the input fasta file.
#[arg(short('i'), long)]
inp_path: PathBuf,

/// The number of samples to take from the input dataset. If not provided,
/// the entire dataset is used.
#[arg(short('n'), long)]
num_samples: Option<usize>,

/// The cost matrix to use for the alignment.
#[arg(short('m'), long)]
cost_matrix: CostMatrix,

/// Optional cost of opening a gap, if using an affine gap penalty. If not
/// provided, a flat gap penalty is used.
#[arg(short('g'), long)]
gap_open: Option<usize>,

/// Whether to build a balanced tree.
#[arg(short('b'), long)]
balanced: bool,

/// Path to the output directory.
#[arg(short('o'), long)]
out_dir: Option<PathBuf>,

/// Whether to only compute the quality metrics.
#[arg(short('q'), long)]
quality_only: bool,
}

/// The cost matrix to use for the alignment.
#[derive(clap::ValueEnum, Debug, Clone)]
#[allow(non_camel_case_types, clippy::doc_markdown)]
#[non_exhaustive]
pub enum CostMatrix {
/// The default matrix. All substitutions have a cost of 1.
#[clap(name = "default")]
Default,
/// Default but with affine gap penalties. Gap opening is 10 and ext is 1.
#[clap(name = "default-affine")]
DefaultAffine,
/// Extended IUPAC matrix.
#[clap(name = "extended-iupac")]
ExtendedIupac,
/// Blosum62 matrix.
#[clap(name = "blosum62")]
Blosum62,
}

impl CostMatrix {
/// Get the cost matrix.
#[must_use]
pub fn cost_matrix<T: Number + core::ops::Neg<Output = T>>(&self, gap_open: Option<usize>) -> msa::CostMatrix<T> {
match self {
Self::Default => msa::CostMatrix::default(),
Self::DefaultAffine => msa::CostMatrix::default_affine(gap_open),
Self::ExtendedIupac => msa::CostMatrix::extended_iupac(gap_open),
Self::Blosum62 => msa::CostMatrix::blosum62(gap_open),
}
}
}

#[allow(clippy::similar_names, clippy::cognitive_complexity, clippy::too_many_lines)]
fn main() -> Result<(), String> {
let args = Args::parse();
println!("{args:?}");

let fasta_name = args
.inp_path
.file_stem()
.ok_or("No file name found")?
.to_string_lossy()
.to_string();
let log_name = format!("musals-{fasta_name}");
// We need the `_guard` in scope to ensure proper logging.
let (_guard, log_path) = configure_logger(&log_name)?;
println!("Log file: {log_path:?}");
ftlog::info!("{args:?}");

let cost_matrix = args.cost_matrix.cost_matrix::<i32>(args.gap_open);
let aligner = msa::Aligner::new(&cost_matrix, b'-');

let out_dir = if let Some(out_dir) = args.out_dir {
if !out_dir.exists() {
std::fs::create_dir(&out_dir).map_err(|e| e.to_string())?;
}
out_dir
} else {
args.inp_path.parent().ok_or("No parent directory found")?.to_path_buf()
};
ftlog::info!("Input file: {:?}", args.inp_path);
ftlog::info!("Output directory: {out_dir:?}");

let (data, _) = bench_utils::fasta::read(&args.inp_path, 0, true)?;
let msa_data = if args.quality_only {
steps::read_aligned(&args.inp_path, &aligner)?
} else {
let data = if let Some(num_samples) = args.num_samples {
ftlog::info!("Sub-sampling dataset to {} samples.", num_samples);
let data = data.random_subsample(&mut rand::thread_rng(), num_samples);
let seq_lens = data.items().iter().map(String::len).collect::<Vec<_>>();
let (min_len, max_len, _, _, _) = bench_utils::fasta::len_stats(&seq_lens);
let name = format!("{}-{num_samples}", data.name());
let data = data
.with_dim_lower_bound(min_len)
.with_dim_upper_bound(max_len)
.with_name(&name);
ftlog::info!("Sub-sampled dataset: length range = {:?}.", data.dimensionality_hint());
data
} else {
data
};
let str_to_seq = |s: String| msa::Sequence::new(s, Some(&aligner));
let data = data.transform_items(str_to_seq);
ftlog::info!(
"Finished reading original dataset: length range = {:?}",
data.dimensionality_hint()
);

let path_manager = path_manager::PathManager::new(data.name(), &out_dir);

let metric = Levenshtein;

let msa_fasta_path = path_manager.msa_fasta_path();
if !msa_fasta_path.exists() {
// Build the MSA.
let start = std::time::Instant::now();
let msa_ball_path = path_manager.msa_ball_path();
let msa_data_path = path_manager.msa_data_path();

let (perm_ball, perm_data) = if msa_ball_path.exists() && msa_data_path.exists() {
// Read the PermutedBall and the dataset.
steps::read_permuted_ball(&msa_ball_path, &msa_data_path, &aligner)?
} else {
let ball_path = path_manager.ball_path();
let ball = if ball_path.exists() {
// Read the Ball.
steps::read_ball(&ball_path)?
} else {
steps::build_ball(&data, &metric, &ball_path, &path_manager.ball_csv_path(), args.balanced)?
};
ftlog::info!("Ball has {} leaves.", ball.leaves().len());

// Build the PermutedBall and the dataset.
steps::build_perm_ball(ball, data, &metric, &msa_ball_path, &msa_data_path)?
};
ftlog::info!("Permuted Ball has {} leaves.", perm_ball.leaves().len());

// Build the MSA.
steps::build_aligned(
&args.cost_matrix,
args.gap_open,
&perm_ball,
&perm_data,
&msa_fasta_path,
)?;
let elapsed = start.elapsed().as_secs_f32();
let msa_build_msg = format!("Finished building MSA in {elapsed:.2} seconds.");
ftlog::info!("{msa_build_msg}");
println!("{msa_build_msg}");
};

// Read the aligned sequences and load the data.
ftlog::info!("Reading aligned sequences from: {msa_fasta_path:?}");
steps::read_aligned(&msa_fasta_path, &aligner)?
};

ftlog::info!(
"Finished reading {} aligned sequences with width = {:?}.",
msa_data.cardinality(),
msa_data.dimensionality_hint()
);

// Compute the quality metrics.
let start = std::time::Instant::now();
let gap_penalty = 1;
let mismatch_penalty = 1;
let gap_open_penalty = 10;
let gap_ext_penalty = 1;

let pg_quality = msa_data.percent_gaps();
ftlog::info!("Percent gaps: {pg_quality:.4}");

let ps_quality = msa_data.par_scoring_pairwise_subsample(gap_penalty, mismatch_penalty);
ftlog::info!("Pairwise scoring metric estimate: {ps_quality}");

let wps_quality =
msa_data.par_weighted_scoring_pairwise_subsample(gap_open_penalty, gap_ext_penalty, mismatch_penalty);
ftlog::info!("Weighted pairwise scoring metric estimate: {wps_quality}");

let (avg_p, max_p) = msa_data.par_p_distance_stats_subsample();
ftlog::info!("Pairwise distance stats estimate: avg = {avg_p:.4}, max = {max_p:.4}");

let dd_quality = msa_data.par_distance_distortion_subsample();
ftlog::info!("Distance distortion metric estimate: {dd_quality}");

ftlog::info!("Converting to column-major format.");
let col_msa_data = msa_data.par_change_major();
ftlog::info!("Finished converting to column-major format.");

let cs_quality = col_msa_data.par_scoring_columns(gap_penalty, mismatch_penalty);
ftlog::info!("Column scoring metric: {cs_quality}");

let elapsed = start.elapsed().as_secs_f32();
ftlog::info!("Quality metrics took {elapsed:.2} seconds.");
println!("Quality metrics took {elapsed:.2} seconds.");

Ok(())
}
67 changes: 67 additions & 0 deletions benches/musals/src/path_manager.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
//! A helper for managing the names of files on disk.
use std::path::{Path, PathBuf};

/// A helper for managing the names of files on disk.
pub struct PathManager {
name: String,
out_dir: PathBuf,
}

impl PathManager {
/// Creates a new `PathManager`.
#[must_use]
pub fn new<P: AsRef<Path>>(name: &str, out_dir: &P) -> Self {
Self {
name: name.to_string(),
out_dir: out_dir.as_ref().to_path_buf(),
}
}

/// The name of the dataset.
#[must_use]
pub fn name(&self) -> &str {
&self.name
}

/// The directory where the dataset is stored.
#[must_use]
pub fn out_dir(&self) -> &Path {
&self.out_dir
}

/// The path to the binary file containing the `Ball` tree.
#[must_use]
pub fn ball_path(&self) -> PathBuf {
let name = format!("{}.ball", self.name());
self.out_dir().join(name)
}

/// The path to the CSV file containing the `Ball` tree.
#[must_use]
pub fn ball_csv_path(&self) -> PathBuf {
let name = format!("{}_ball.csv", self.name());
self.out_dir().join(name)
}

/// Path to file containing tree used toe making an MSA.
#[must_use]
pub fn msa_ball_path(&self) -> PathBuf {
let name = format!("{}_ball.msa", self.name());
self.out_dir().join(name)
}

/// Path to file containing containing the MSA of the dataset.
#[must_use]
pub fn msa_data_path(&self) -> PathBuf {
let name = format!("{}.msa", self.name());
self.out_dir().join(name)
}

/// Path to file containing the MSA of the dataset in FASTA format.
#[must_use]
pub fn msa_fasta_path(&self) -> PathBuf {
let name = format!("{}_msa.fasta", self.name());
self.out_dir().join(name)
}
}
Loading

0 comments on commit d8aada9

Please sign in to comment.