From 5eba6e79dda8bb7bd3bf7c9f91e5da3f41d71cb0 Mon Sep 17 00:00:00 2001
From: Najib Ishaq
Date: Sun, 12 Nov 2023 13:41:23 -0500
Subject: [PATCH] feat: silva benches now save/load Cakes

---
 cakes-results/src/genomic/main.rs       | 310 +++++++++++++++++++-----
 cakes-results/src/genomic/read_silva.rs |  41 +++-
 2 files changed, 286 insertions(+), 65 deletions(-)

diff --git a/cakes-results/src/genomic/main.rs b/cakes-results/src/genomic/main.rs
index 7e822a9af..39b4d5731 100644
--- a/cakes-results/src/genomic/main.rs
+++ b/cakes-results/src/genomic/main.rs
@@ -26,13 +26,12 @@ use std::{
     time::Instant,
 };
 
-use abd_clam::{Cakes, Dataset, Instance};
+use abd_clam::{Cakes, Dataset, Instance, VecDataset};
 use clap::Parser;
 use distances::Number;
 use log::{debug, info, warn};
 use serde::{Deserialize, Serialize};
 
-#[allow(clippy::too_many_lines)]
 fn main() -> Result<(), String> {
     env_logger::Builder::from_default_env()
         .filter_level(log::LevelFilter::Info)
@@ -40,6 +39,14 @@
 
     let args = Args::parse();
 
+    // Parse the metric.
+    let metric = Metric::from_str(&args.metric)?;
+    let metric_name = metric.name();
+    info!("Using metric: {metric_name}");
+
+    let is_expensive = metric.is_expensive();
+    let metric = metric.metric();
+
     // Check that the data set exists.
     let stem = "silva-SSU-Ref";
     let data_paths = [
@@ -56,19 +63,11 @@
     info!("Using data from {unaligned_path:?}.");
 
     // Check that the output directory exists.
-    let output_dir = args.output_dir;
+    let output_dir = &args.output_dir;
     if !output_dir.exists() {
         return Err(format!("Output directory {output_dir:?} does not exist."));
     }
 
-    // Parse the metric.
-    let metric = Metric::from_str(&args.metric)?;
-    let metric_name = metric.name();
-    info!("Using metric: {metric_name}");
-
-    let is_expensive = metric.is_expensive();
-    let metric = metric.metric();
-
     let [train, queries, train_headers, query_headers] =
         read_silva::silva_to_dataset(&unaligned_path, &headers_path, metric, is_expensive)?;
     info!(
@@ -87,85 +86,193 @@ fn main() -> Result<(), String> {
     let queries = queries.data().iter().collect::<Vec<_>>();
     let query_headers = query_headers.data().iter().collect::<Vec<_>>();
 
-    let seed = args.seed;
-    let criteria = abd_clam::PartitionCriteria::default();
+    // Check if the cakes data structure already exists.
+    let cakes_dir = args.input_dir.join(format!("{stem}-cakes"));
+    let (cakes, built, build_time) = if cakes_dir.exists() {
+        info!("Loading search tree from {cakes_dir:?} ...");
+        let start = Instant::now();
+        let cakes = Cakes::load(&cakes_dir, metric, is_expensive)?;
+        let load_time = start.elapsed().as_secs_f32();
+        info!("Loaded search tree in {load_time:.2e} seconds.");
 
-    info!("Creating search tree ...");
-    let start = Instant::now();
-    let mut cakes = Cakes::new(train, seed, &criteria);
-    let build_time = start.elapsed().as_secs_f32();
-    info!("Created search tree in {build_time:.2e} seconds.");
+        let tuned_algorithm = cakes.tuned_knn_algorithm();
+        let tuned_algorithm = tuned_algorithm.name();
+        info!("Tuned algorithm is {tuned_algorithm}");
 
-    let tuning_depth = args.tuning_depth;
-    let tuning_k = args.tuning_k;
-    info!("Tuning knn-search with k {tuning_k} and depth {tuning_depth} ...");
+        (cakes, false, load_time)
+    } else {
+        let seed = args.seed;
+        let criteria = abd_clam::PartitionCriteria::default();
 
-    let start = Instant::now();
-    cakes.auto_tune_knn(tuning_depth, tuning_k);
-    let tuning_time = start.elapsed().as_secs_f32();
-    info!("Tuned knn-search in {tuning_time:.2e} seconds.");
+        info!("Creating search tree ...");
+        let start = Instant::now();
+        let mut cakes = Cakes::new(train, seed, &criteria);
+        let build_time = start.elapsed().as_secs_f32();
+        info!("Created search tree in {build_time:.2e} seconds.");
+
+        let tuning_depth = args.tuning_depth;
+        let tuning_k = args.tuning_k;
+        info!("Tuning knn-search with k {tuning_k} and depth {tuning_depth} ...");
+
+        let start = Instant::now();
+        cakes.auto_tune_knn(tuning_depth, tuning_k);
+        let tuning_time = start.elapsed().as_secs_f32();
+        info!("Tuned knn-search in {tuning_time:.2e} seconds.");
+
+        let tuned_algorithm = cakes.tuned_knn_algorithm();
+        let tuned_algorithm = tuned_algorithm.name();
+        info!("Tuned algorithm is {tuned_algorithm}");
+
+        // Save the Cakes data structure.
+        std::fs::create_dir(&cakes_dir).map_err(|e| e.to_string())?;
+        cakes.save(&cakes_dir)?;
+
+        (cakes, true, build_time + tuning_time)
+    };
+
+    measure_throughput(
+        &cakes,
+        built,
+        build_time,
+        &args,
+        &queries,
+        &query_headers,
+        &train_headers,
+        stem,
+        metric_name,
+        output_dir,
+    )
+}
+
+/// Measure the throughput of the tuned algorithm.
+///
+/// # Arguments
+///
+/// * `cakes`: The cakes data structure.
+/// * `built`: Whether the cakes data structure was built or loaded from disk.
+/// * `build_time`: The time taken to build the cakes data structure or load it
+///   from disk.
+/// * `args`: The command line arguments.
+/// * `queries`: The queries.
+/// * `query_headers`: The headers of the queries.
+/// * `train_headers`: The headers of the training set.
+/// * `stem`: The stem of the data set name.
+/// * `metric_name`: The name of the metric.
+/// * `output_dir`: The output directory.
+///
+/// # Errors
+///
+/// * If the `output_dir` does not exist or cannot be written to.
+#[allow(clippy::too_many_arguments)]
+fn measure_throughput(
+    cakes: &Cakes<String, u32, VecDataset<String, u32>>,
+    built: bool,
+    build_time: f32,
+    args: &Args,
+    queries: &[&String],
+    query_headers: &[&String],
+    train_headers: &VecDataset<String, u32>,
+    stem: &str,
+    metric_name: &str,
+    output_dir: &Path,
+) -> Result<(), String> {
     let tuned_algorithm = cakes.tuned_knn_algorithm();
     let tuned_algorithm = tuned_algorithm.name();
-    info!("Tuned algorithm is {tuned_algorithm}");
 
     let train = cakes.shards()[0];
 
     // Perform knn-search for each value of k on all queries.
-    for k in args.ks {
+    for &k in &args.ks {
         info!("Starting knn-search with k = {k} ...");
 
         // Run the tuned algorithm.
         let start = Instant::now();
-        let results = cakes.batch_tuned_knn_search(&queries, k);
+        let results = cakes.batch_tuned_knn_search(queries, k);
         let search_time = start.elapsed().as_secs_f32();
         let throughput = queries.len().as_f32() / search_time;
         info!("With k = {k}, achieved throughput of {throughput:.2e} QPS.");
 
         // Run the linear search algorithm.
         let start = Instant::now();
-        let linear_results = cakes.batch_linear_knn_search(&queries, k);
+        let linear_results = cakes.batch_linear_knn_search(queries, k);
         let linear_search_time = start.elapsed().as_secs_f32();
         let linear_throughput = queries.len().as_f32() / linear_search_time;
         info!("With k = {k}, achieved linear search throughput of {linear_throughput:.2e} QPS.",);
 
-        // Compute the recall of the tuned algorithm.
-        let mean_recall = results
-            .iter()
-            .zip(linear_results)
-            .map(|(hits, linear_hits)| compute_recall(hits.clone(), linear_hits))
-            .sum::<f32>()
-            / queries.len().as_f32();
+        let (mean_recall, hits) = recall_and_hits(
+            results,
+            linear_results,
+            queries,
+            query_headers,
+            train_headers,
+            train,
+        );
         info!("With k = {k}, achieved mean recall of {mean_recall:.3}.");
 
-        // Convert results to original indices.
-        let hits = results.into_iter().map(|hits| {
-            hits.into_iter()
-                .map(|(index, distance)| (train.original_index(index), distance))
-                .collect::<Vec<_>>()
-        });
-
-        // Collect query header, hit headers, and hit distances.
-        let hits = hits
-            .zip(query_headers.iter())
-            .map(|(hits, &query)| {
-                (
-                    query.clone(),
-                    hits.into_iter()
-                        .map(|(index, distance)| (train_headers[index].clone(), distance))
-                        .collect::<Vec<_>>(),
-                )
-            })
-            .collect::<Vec<_>>();
+        // Create the report.
+        let report = Report {
+            dataset: stem,
+            metric: metric_name,
+            cardinality: train.cardinality(),
+            built,
+            build_time,
+            shard_sizes: cakes.shard_cardinalities(),
+            num_queries: queries.len(),
+            kind: "knn",
+            val: k,
+            tuned_algorithm,
+            throughput,
+            linear_throughput,
+            hits,
+            mean_recall,
+        };
+
+        // Save the report.
+        report.save(output_dir)?;
+    }
+
+    // Perform range search for each value of r on all queries.
+    for &r in &args.rs {
+        info!("Starting range search with r = {r} ...");
+
+        #[allow(clippy::cast_possible_truncation)]
+        let radius = r as u32;
+
+        // Run the tuned algorithm.
+        let start = Instant::now();
+        let results = cakes.batch_tuned_rnn_search(queries, radius);
+        let search_time = start.elapsed().as_secs_f32();
+        let throughput = queries.len().as_f32() / search_time;
+        info!("With r = {r}, achieved throughput of {throughput:.2e} QPS.");
+
+        // Run the linear search algorithm.
+        let start = Instant::now();
+        let linear_results = cakes.batch_linear_rnn_search(queries, radius);
+        let linear_search_time = start.elapsed().as_secs_f32();
+        let linear_throughput = queries.len().as_f32() / linear_search_time;
+        info!("With r = {r}, achieved linear search throughput of {linear_throughput:.2e} QPS.",);
+
+        let (mean_recall, hits) = recall_and_hits(
+            results,
+            linear_results,
+            queries,
+            query_headers,
+            train_headers,
+            train,
+        );
+        info!("With r = {r}, achieved mean recall of {mean_recall:.3}.");
 
         // Create the report.
         let report = Report {
             dataset: stem,
             metric: metric_name,
             cardinality: train.cardinality(),
+            built,
+            build_time,
             shard_sizes: cakes.shard_cardinalities(),
             num_queries: queries.len(),
-            k,
+            kind: "rnn",
+            val: r,
             tuned_algorithm,
             throughput,
             linear_throughput,
@@ -174,12 +281,67 @@ fn main() -> Result<(), String> {
         };
 
         // Save the report.
-        report.save(&output_dir)?;
+        report.save(output_dir)?;
     }
 
     Ok(())
 }
 
+/// Compute the recall and hits of the tuned algorithm.
+///
+/// # Arguments
+///
+/// * `results`: The results of the tuned algorithm.
+/// * `linear_results`: The results of linear search.
+/// * `queries`: The queries.
+/// * `query_headers`: The headers of the queries.
+/// * `train_headers`: The headers of the training set.
+/// * `train`: The training set.
+///
+/// # Returns
+///
+/// * The mean recall of the tuned algorithm.
+/// * The headers of hits of the tuned algorithm.
+#[allow(clippy::type_complexity)]
+fn recall_and_hits(
+    results: Vec<Vec<(usize, u32)>>,
+    linear_results: Vec<Vec<(usize, u32)>>,
+    queries: &[&String],
+    query_headers: &[&String],
+    train_headers: &VecDataset<String, u32>,
+    train: &VecDataset<String, u32>,
+) -> (f32, Vec<(String, Vec<(String, u32)>)>) {
+    // Compute the recall of the tuned algorithm.
+    let mean_recall = results
+        .iter()
+        .zip(linear_results)
+        .map(|(hits, linear_hits)| compute_recall(hits.clone(), linear_hits))
+        .sum::<f32>()
+        / queries.len().as_f32();
+
+    // Convert results to original indices.
+    let hits = results.into_iter().map(|hits| {
+        hits.into_iter()
+            .map(|(index, distance)| (train.original_index(index), distance))
+            .collect::<Vec<_>>()
+    });
+
+    // Collect query header, hit headers, and hit distances.
+    let hits = hits
+        .zip(query_headers.iter())
+        .map(|(hits, &query)| {
+            (
+                query.clone(),
+                hits.into_iter()
+                    .map(|(index, distance)| (train_headers[index].clone(), distance))
+                    .collect::<Vec<_>>(),
+            )
+        })
+        .collect::<Vec<_>>();
+
+    (mean_recall, hits)
+}
+
 /// CLI arguments for the genomic benchmarks.
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
@@ -205,6 +367,9 @@ struct Args {
     /// Number of nearest neighbors to search for.
     #[arg(long, value_parser, num_args = 1.., value_delimiter = ' ', default_value = "10 100")]
     ks: Vec<usize>,
+    /// Radii to use for range search.
+    #[arg(long, value_parser, num_args = 1.., value_delimiter = ' ', default_value = "25 100 250")]
+    rs: Vec<usize>,
     /// Seed for the random number generator.
     #[arg(long)]
     seed: Option<u64>,
@@ -287,17 +452,22 @@ struct Report<'a> {
     metric: &'a str,
     /// Number of data points in the data set.
     cardinality: usize,
+    /// Whether the search tree was built or loaded from disk.
+    built: bool,
+    /// Time taken to build the search tree or load it from disk.
+    build_time: f32,
     /// Sizes of the shards created for `ShardedCakes`.
     shard_sizes: Vec<usize>,
     /// Number of queries used for search.
     num_queries: usize,
-    /// Number of nearest neighbors to search for.
-    k: usize,
+    /// The kind of search performed. One of "knn" or "rnn".
+    kind: &'a str,
+    /// Value of k used for knn-search or value of r used for range search.
+    val: usize,
     /// Name of the algorithm used after auto-tuning.
     tuned_algorithm: &'a str,
     /// Throughput of the tuned algorithm.
     throughput: f32,
-    // TODO: Include linear search throughput.
     /// Throughput of linear search.
     linear_throughput: f32,
     /// Hits for each query.
@@ -308,8 +478,25 @@ impl Report<'_> {
     /// Save the report to a file in the given directory.
+    ///
+    /// # Arguments
+    ///
+    /// * `dir`: The directory to save the report to.
+    ///
+    /// # Errors
+    ///
+    /// * If the directory does not exist or cannot be written to.
+    /// * If the report cannot be serialized.
     fn save(&self, dir: &Path) -> Result<(), String> {
-        let path = dir.join(format!("{}_{}.json", self.dataset, self.k));
+        if !dir.exists() {
+            return Err(format!("Directory {dir:?} does not exist."));
+        }
+
+        if !dir.is_dir() {
+            return Err(format!("{dir:?} is not a directory."));
+        }
+
+        let path = dir.join(format!("{}_{}_{}.json", self.dataset, self.kind, self.val));
         let report = serde_json::to_string_pretty(&self).map_err(|e| e.to_string())?;
         std::fs::write(path, report).map_err(|e| e.to_string())?;
         Ok(())
     }
@@ -332,7 +519,6 @@ pub fn compute_recall(
     mut linear_hits: Vec<(usize, U)>,
 ) -> f32 {
     if linear_hits.is_empty() {
-        warn!("Linear search was too slow. Skipping recall computation.");
         1.0
     } else {
         let (num_hits, num_linear_hits) = (hits.len(), linear_hits.len());
diff --git a/cakes-results/src/genomic/read_silva.rs b/cakes-results/src/genomic/read_silva.rs
index 1ef892ece..704d0eecf 100644
--- a/cakes-results/src/genomic/read_silva.rs
+++ b/cakes-results/src/genomic/read_silva.rs
@@ -7,6 +7,7 @@ use std::{
 };
 
 use abd_clam::{Dataset, VecDataset};
+use distances::Number;
 use log::info;
 use rand::prelude::*;
 
@@ -64,6 +65,31 @@ pub fn silva_to_dataset(
         sequences.len()
     );
 
+    let seq_lengths = sequences.iter().map(String::len).collect::<Vec<_>>();
+    let min_len = seq_lengths
+        .iter()
+        .min()
+        .unwrap_or_else(|| unreachable!("No sequences!"));
+    let max_len = seq_lengths
+        .iter()
+        .max()
+        .unwrap_or_else(|| unreachable!("No sequences!"));
+    let (mean, std_dev) = {
+        let sum = seq_lengths.iter().sum::<usize>().as_f64();
+        let mean = sum / seq_lengths.len().as_f64();
+        let variance = seq_lengths
+            .iter()
+            .map(|&len| (len.as_f64() - mean).powi(2))
+            .sum::<f64>()
+            / seq_lengths.len().as_f64();
+        let std_dev = variance.sqrt();
+        (mean, std_dev)
+    };
+    info!("Minimum sequence length: {min_len}");
+    info!("Maximum sequence length: {max_len}");
+    info!("Mean sequence length: {mean:.3}");
+    info!("Standard deviation of sequence length: {std_dev:.3}");
+
     // Read the headers file.
     let file = File::open(headers_path)
         .map_err(|e| format!("Could not open file {headers_path:?}: {e}"))?;
@@ -76,12 +102,21 @@
     // join the lines and headers into a single vector of (line, header) pairs.
     let mut sequences = sequences.into_iter().zip(headers).collect::<Vec<_>>();
-    sequences.shuffle(&mut thread_rng());
+
+    // Seed the random number generator.
+    let mut rng = rand::rngs::StdRng::seed_from_u64(42);
+    sequences.shuffle(&mut rng);
     info!("Shuffled sequences and headers.");
 
+    // TODO: Remove this for the run on Ark
+    // Keep 1100 sequences
+    sequences.truncate(1100);
+
+    // TODO: Change num_queries to 1000 for the run on Ark
     // Split the lines into the training and query sets.
-    let train = sequences.split_off(1000);
-    let (train, train_headers): (Vec<_>, Vec<_>) = train.into_iter().unzip();
+    let num_queries = 100;
+    let (train, train_headers): (Vec<_>, Vec<_>) =
+        sequences.split_off(num_queries).into_iter().unzip();
     let train = VecDataset::new(format!("{stem}-queries"), train, metric, is_expensive);
     let train_headers = VecDataset::new(
         format!("{stem}-train-headers"),