Skip to content

Commit

Permalink
wip: updates to musals
Browse files (browse the repository at this point in the history)
  • Loading branch information
nishaq503 committed Jan 27, 2025
1 parent d8aada9 commit d78583d
Show file tree
Hide file tree
Showing 50 changed files with 70 additions and 76 deletions.
4 changes: 2 additions & 2 deletions benches/cakes/src/metric/cosine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ impl<I: AsRef<[f32]>> Metric<I, f32> for Cosine {
distances::simd::cosine_f32(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"cosine"
}

Expand Down Expand Up @@ -89,7 +89,7 @@ impl<I: AsRef<[f64]>> Metric<I, f64> for Cosine {
distances::simd::cosine_f64(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"cosine"
}

Expand Down
2 changes: 1 addition & 1 deletion benches/cakes/src/metric/dtw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl<I: AsRef<[Complex<f64>]>> Metric<I, f64> for DynamicTimeWarping {
dtw_distance(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"dtw"
}

Expand Down
4 changes: 2 additions & 2 deletions benches/cakes/src/metric/euclidean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ impl<I: AsRef<[f32]>> Metric<I, f32> for Euclidean {
distances::simd::euclidean_f32(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"euclidean"
}

Expand Down Expand Up @@ -89,7 +89,7 @@ impl<I: AsRef<[f64]>> Metric<I, f64> for Euclidean {
distances::simd::euclidean_f64(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"euclidean"
}

Expand Down
2 changes: 1 addition & 1 deletion benches/cakes/src/metric/hamming.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ impl Metric<String, u32> for Hamming {
distances::strings::hamming(a, b)
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"hamming"
}

Expand Down
2 changes: 1 addition & 1 deletion benches/cakes/src/metric/levenshtein.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl Metric<String, u32> for Levenshtein {
stringzilla::sz::edit_distance(a, b).as_u32()
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"levenshtein"
}

Expand Down
33 changes: 14 additions & 19 deletions benches/musals/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use std::path::PathBuf;

use abd_clam::{metric::Levenshtein, msa, Cluster, Dataset};
use abd_clam::{metric::Levenshtein, musals, Cluster, Dataset};
use bench_utils::configure_logger;
use clap::Parser;
use distances::Number;
Expand All @@ -39,8 +39,8 @@ struct Args {
num_samples: Option<usize>,

/// The cost matrix to use for the alignment.
#[arg(short('m'), long)]
cost_matrix: CostMatrix,
#[arg(short('m'), long, default_value = "default")]
matrix: CostMatrix,

/// Optional cost of opening a gap, if using an affine gap penalty. If not
/// provided, a flat gap penalty is used.
Expand All @@ -55,7 +55,8 @@ struct Args {
#[arg(short('o'), long)]
out_dir: Option<PathBuf>,

/// Whether to only compute the quality metrics.
/// Whether to only compute the quality metrics. This will assume that the
/// input file contains aligned sequences.
#[arg(short('q'), long)]
quality_only: bool,
}
Expand All @@ -82,12 +83,12 @@ pub enum CostMatrix {
impl CostMatrix {
/// Get the cost matrix.
#[must_use]
pub fn cost_matrix<T: Number + core::ops::Neg<Output = T>>(&self, gap_open: Option<usize>) -> msa::CostMatrix<T> {
pub fn cost_matrix<T: Number + core::ops::Neg<Output = T>>(&self, gap_open: Option<usize>) -> musals::CostMatrix<T> {
match self {
Self::Default => msa::CostMatrix::default(),
Self::DefaultAffine => msa::CostMatrix::default_affine(gap_open),
Self::ExtendedIupac => msa::CostMatrix::extended_iupac(gap_open),
Self::Blosum62 => msa::CostMatrix::blosum62(gap_open),
Self::Default => musals::CostMatrix::default(),
Self::DefaultAffine => musals::CostMatrix::default_affine(gap_open),
Self::ExtendedIupac => musals::CostMatrix::extended_iupac(gap_open),
Self::Blosum62 => musals::CostMatrix::blosum62(gap_open),
}
}
}
Expand All @@ -109,8 +110,8 @@ fn main() -> Result<(), String> {
println!("Log file: {log_path:?}");
ftlog::info!("{args:?}");

let cost_matrix = args.cost_matrix.cost_matrix::<i32>(args.gap_open);
let aligner = msa::Aligner::new(&cost_matrix, b'-');
let cost_matrix = args.matrix.cost_matrix::<i32>(args.gap_open);
let aligner = musals::Aligner::new(&cost_matrix, b'-');

let out_dir = if let Some(out_dir) = args.out_dir {
if !out_dir.exists() {
Expand Down Expand Up @@ -142,7 +143,7 @@ fn main() -> Result<(), String> {
} else {
data
};
let str_to_seq = |s: String| msa::Sequence::new(s, Some(&aligner));
let str_to_seq = |s: String| musals::Sequence::new(s, Some(&aligner));
let data = data.transform_items(str_to_seq);
ftlog::info!(
"Finished reading original dataset: length range = {:?}",
Expand Down Expand Up @@ -179,13 +180,7 @@ fn main() -> Result<(), String> {
ftlog::info!("Permuted Ball has {} leaves.", perm_ball.leaves().len());

// Build the MSA.
steps::build_aligned(
&args.cost_matrix,
args.gap_open,
&perm_ball,
&perm_data,
&msa_fasta_path,
)?;
steps::build_aligned(&args.matrix, args.gap_open, &perm_ball, &perm_data, &msa_fasta_path)?;
let elapsed = start.elapsed().as_secs_f32();
let msa_build_msg = format!("Finished building MSA in {elapsed:.2} seconds.");
ftlog::info!("{msa_build_msg}");
Expand Down
12 changes: 6 additions & 6 deletions benches/musals/src/steps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use abd_clam::{
cluster::{adapter::ParBallAdapter, BalancedBall, ClusterIO, Csv, ParPartition},
dataset::{AssociatesMetadata, AssociatesMetadataMut, DatasetIO},
metric::ParMetric,
msa::{self, Aligner, Sequence},
musals::{Aligner, Columns, Sequence, MSA},
Ball, Cluster, Dataset, FlatVec,
};

Expand All @@ -25,18 +25,18 @@ pub fn build_aligned<P: AsRef<Path>>(
ftlog::info!("Setting up aligner...");
let gap = b'-';
let cost_matrix = matrix.cost_matrix::<i32>(gap_open);
let aligner = msa::Aligner::new(&cost_matrix, gap);
let aligner = Aligner::new(&cost_matrix, gap);

ftlog::info!("Aligning sequences...");
let builder = msa::Columns::new(gap).par_with_tree(perm_ball, data, &aligner);
let builder = Columns::new(gap).par_with_tree(perm_ball, data, &aligner);

ftlog::info!("Extracting aligned sequences...");
let msa = builder.to_flat_vec_rows().with_metadata(data.metadata())?;
let transformer = |s: Vec<u8>| s.into_iter().map(|c| c as char).collect::<String>();
let msa = msa.transform_items(transformer);

ftlog::info!("Finished aligning {} sequences.", builder.len());
let data = msa::MSA::new(&aligner, msa)?;
let data = MSA::new(&aligner, msa)?;

ftlog::info!("Writing MSA to {:?}", out_path.as_ref());
bench_utils::fasta::write(&data, out_path)?;
Expand All @@ -45,10 +45,10 @@ pub fn build_aligned<P: AsRef<Path>>(
}

/// Read the aligned fasta file.
pub fn read_aligned<P: AsRef<Path>>(path: &P, aligner: &Aligner<i32>) -> Result<msa::MSA<String, i32, String>, String> {
pub fn read_aligned<P: AsRef<Path>>(path: &P, aligner: &Aligner<i32>) -> Result<MSA<String, i32, String>, String> {
ftlog::info!("Reading aligned sequences from {:?}", path.as_ref());
let (data, _) = bench_utils::fasta::read(path, 0, false)?;
msa::MSA::new(aligner, data)
MSA::new(aligner, data)
}

/// Build the `PermutedBall` and the permuted dataset.
Expand Down
2 changes: 1 addition & 1 deletion benches/utils/src/fasta/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pub fn read<P: AsRef<Path>>(
return Err(format!("Path {path:?} does not exist!"));
}

if !path.extension().map_or(false, |ext| ext == "fasta") {
if path.extension().is_none_or(|ext| ext != "fasta") {
return Err(format!("Path {path:?} does not have the `.fasta` extension!"));
}

Expand Down
4 changes: 2 additions & 2 deletions benches/utils/src/metrics/dtw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ impl<I: AsRef<[Complex<f32>]>> Metric<I, f32> for Dtw {
dtw_distance(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"dtw"
}

Expand Down Expand Up @@ -93,7 +93,7 @@ impl<I: AsRef<[Complex<f64>]>> Metric<I, f64> for Dtw {
dtw_distance(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"dtw"
}

Expand Down
2 changes: 1 addition & 1 deletion benches/utils/src/metrics/jaccard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ impl<I: AsRef<[usize]>> Metric<I, f32> for Jaccard {
distances::sets::jaccard(a.as_ref(), b.as_ref())
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"jaccard"
}

Expand Down
2 changes: 1 addition & 1 deletion benches/utils/src/metrics/levenshtein.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ impl<I: AsRef<[u8]>, T: Number> Metric<I, T> for Levenshtein {
T::from(stringzilla::sz::edit_distance(a.as_ref(), b.as_ref()))
}

fn name(&self) -> &str {
fn name(&self) -> &'static str {
"levenshtein"
}

Expand Down
6 changes: 3 additions & 3 deletions crates/abd-clam/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ statistical = "1.0.0"
disk-io = ["dep:serde", "dep:csv", "dep:bitcode", "dep:flate2", "dep:ndarray", "dep:ndarray-npy"]
chaoda = ["dep:smartcore"]
mbed = ["chaoda"]
msa = ["dep:stringzilla", "dep:bytecount"]
all = ["disk-io", "chaoda", "mbed", "msa"]
musals = ["dep:stringzilla", "dep:bytecount"]
all = ["disk-io", "chaoda", "mbed", "musals"]

[[bench]]
name = "vector_search"
Expand All @@ -77,7 +77,7 @@ harness = false
[[bench]]
name = "genomic_search"
harness = false
required-features = ["msa"]
required-features = ["musals"]

[[bench]]
name = "ann_benchmarks"
Expand Down
4 changes: 2 additions & 2 deletions crates/abd-clam/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ This crate provides the following features:

- `disk-io`: Enables easy IO for several structs, primarily using `bitcode` and `serde`.
- `chaoda`: Enables anomaly detection using CHAODA.
- `msa`: Enables multiple sequence alignment.
- `musals`: Enables multiple sequence alignment.
- `mbed`: Enables dimensionality reduction algorithms.
- `all`: Enables all features.

Expand Down Expand Up @@ -103,7 +103,7 @@ use abd_clam::{
cluster::{adapter::ParBallAdapter, ClusterIO, ParPartition},
dataset::{AssociatesMetadataMut, DatasetIO},
metric::Levenshtein,
msa::{Aligner, CostMatrix, Sequence},
musals::{Aligner, CostMatrix, Sequence},
pancakes::{CodecData, SquishyBall},
Ball, Cluster, Dataset, FlatVec,
};
Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/benches/genomic_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use abd_clam::{
cluster::{adapter::ParBallAdapter, BalancedBall, ParPartition},
dataset::{AssociatesMetadata, AssociatesMetadataMut},
metric::Levenshtein,
msa::{Aligner, CostMatrix, Sequence},
musals::{Aligner, CostMatrix, Sequence},
pancakes::SquishyBall,
Ball, Cluster, Dataset, FlatVec,
};
Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/search/knn_breadth_first.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pub struct KnnBreadthFirst(pub usize);
impl<I, T: Number, C: Cluster<T>, M: Metric<I, T>, D: Searchable<I, T, C, M>> SearchAlgorithm<I, T, C, M, D>
for KnnBreadthFirst
{
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"KnnBreadthFirst"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/search/knn_depth_first.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub struct KnnDepthFirst(pub usize);
impl<I, T: Number, C: Cluster<T>, M: Metric<I, T>, D: Searchable<I, T, C, M>> SearchAlgorithm<I, T, C, M, D>
for KnnDepthFirst
{
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"KnnDepthFirst"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/search/knn_hinted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ where
M: Metric<I, T>,
D: HintedDataset<I, T, C, M>,
{
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"KnnHinted"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/search/knn_linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pub struct KnnLinear(pub usize);
impl<I, T: Number, C: Cluster<T>, M: Metric<I, T>, D: Searchable<I, T, C, M>> SearchAlgorithm<I, T, C, M, D>
for KnnLinear
{
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"KnnLinear"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/search/knn_repeated_rnn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub struct KnnRepeatedRnn<T: Number>(pub usize, pub T);
impl<I, T: Number, C: Cluster<T>, M: Metric<I, T>, D: Searchable<I, T, C, M>> SearchAlgorithm<I, T, C, M, D>
for KnnRepeatedRnn<T>
{
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"KnnRepeatedRnn"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/search/rnn_clustered.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub struct RnnClustered<T: Number>(pub T);
impl<I, T: Number, C: Cluster<T>, M: Metric<I, T>, D: Searchable<I, T, C, M>> SearchAlgorithm<I, T, C, M, D>
for RnnClustered<T>
{
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"RnnClustered"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/search/rnn_linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pub struct RnnLinear<T: Number>(pub T);
impl<I, T: Number, C: Cluster<T>, M: Metric<I, T>, D: Searchable<I, T, C, M>> SearchAlgorithm<I, T, C, M, D>
for RnnLinear<T>
{
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"RnnLinear"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/chaoda/training/algorithms/cc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use super::GraphEvaluator;
pub struct ClusterCardinality;

impl<T: Number, S: Cluster<T>> GraphEvaluator<T, S> for ClusterCardinality {
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"cc"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/chaoda/training/algorithms/gn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ impl GraphNeighborhood {
}

impl<T: Number, S: Cluster<T>> GraphEvaluator<T, S> for GraphNeighborhood {
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"gn"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/chaoda/training/algorithms/pc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use super::GraphEvaluator;
pub struct ParentCardinality;

impl<T: Number, S: Cluster<T>> GraphEvaluator<T, S> for ParentCardinality {
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"pc"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/chaoda/training/algorithms/sc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use super::GraphEvaluator;
pub struct SubgraphCardinality;

impl<T: Number, S: Cluster<T>> GraphEvaluator<T, S> for SubgraphCardinality {
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"sc"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/chaoda/training/algorithms/sp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ impl StationaryProbability {
}

impl<T: Number, S: Cluster<T>> GraphEvaluator<T, S> for StationaryProbability {
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"sp"
}

Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/chaoda/training/algorithms/vd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use super::GraphEvaluator;
pub struct VertexDegree;

impl<T: Number, S: Cluster<T>> GraphEvaluator<T, S> for VertexDegree {
fn name(&self) -> &str {
fn name(&self) -> &'static str {
"vd"
}

Expand Down
Loading

0 comments on commit d78583d

Please sign in to comment.