fixed random dataset run and added lfd-plots
nishaq503 committed Jan 9, 2025
1 parent 84b2fb0 commit f5cd265
Showing 3 changed files with 120 additions and 19 deletions.
2 changes: 1 addition & 1 deletion benches/cakes/src/main.rs
@@ -211,7 +211,7 @@ fn main() -> Result<(), String> {
};

let (queries, subsampled_paths) =
data_gen::read_fasta_and_subsample(&inp_dir, &out_dir, false, args.num_queries, max_power, seed)?;
data_gen::read_fasta_and_subsample(&inp_dir, &out_dir, true, args.num_queries, max_power, seed)?;
let queries = queries.into_iter().map(|(_, q)| q).collect::<Vec<_>>();

ftlog::info!("Found {} sub-sampled datasets:", subsampled_paths.len());
25 changes: 16 additions & 9 deletions benches/cakes/src/workflow.rs
@@ -75,15 +75,15 @@ where
let data = FlatVec::<Vec<f32>, usize>::read_npy(&data_path)?;

let neighbors_path = out_dir.as_ref().join(format!("{data_name}-neighbors.npy"));
let neighbors = FlatVec::<Vec<u64>, usize>::read_npy(&neighbors_path)?.take_items();
let neighbors = neighbors
.into_iter()
.map(|n| n.into_iter().map(Number::as_usize).collect::<Vec<_>>());

let distances_path = out_dir.as_ref().join(format!("{data_name}-distances.npy"));
let distances = FlatVec::<Vec<f32>, usize>::read_npy(&distances_path)?.take_items();
let (queries, neighbors) = if neighbors_path.exists() && distances_path.exists() {
let neighbors = FlatVec::<Vec<u64>, usize>::read_npy(&neighbors_path)?.take_items();
let neighbors = neighbors
.into_iter()
.map(|n| n.into_iter().map(Number::as_usize).collect::<Vec<_>>());

let distances = FlatVec::<Vec<f32>, usize>::read_npy(&distances_path)?.take_items();

let (queries, neighbors) = {
let neighbors = neighbors
.zip(distances)
.map(|(n, d)| n.into_iter().zip(d).collect::<Vec<_>>())
@@ -96,8 +96,15 @@
let _ = queries.split_off(num_queries);

let (queries, neighbors): (Vec<_>, Vec<_>) = queries.into_iter().unzip();
(queries, neighbors)
(queries, Some(neighbors))
} else {
let mut rng = rand::thread_rng();
let mut queries = queries.to_vec();
queries.shuffle(&mut rng);
let _ = queries.split_off(num_queries);
(queries, None)
};
let neighbors = neighbors.as_deref();

let all_paths = AllPaths::new(out_dir, data.name());
if rebuild_trees || !all_paths.all_exist() {
@@ -107,7 +114,7 @@
&all_paths,
metric,
&queries,
Some(&neighbors),
neighbors,
radial_fractions,
ks,
max_time,
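For illustration only, a minimal Python sketch of the fallback this workflow.rs change introduces: use the saved neighbors/distances ground truth when both .npy files exist, otherwise shuffle the queries and continue with no ground truth. The function name, array shapes, and paths below are hypothetical and not part of the crate's API.

from pathlib import Path

import numpy as np


def load_queries_and_ground_truth(out_dir: Path, data_name: str, queries: np.ndarray, num_queries: int, seed: int = 42):
    # Returns (queries, ground_truth); ground_truth is None when no saved results exist.
    rng = np.random.default_rng(seed)
    neighbors_path = out_dir / f"{data_name}-neighbors.npy"
    distances_path = out_dir / f"{data_name}-distances.npy"

    if neighbors_path.exists() and distances_path.exists():
        neighbors = np.load(neighbors_path)  # (num_stored_queries, k) indices
        distances = np.load(distances_path)  # (num_stored_queries, k) distances
        keep = rng.permutation(len(queries))[:num_queries]
        ground_truth = [list(zip(n, d)) for n, d in zip(neighbors[keep], distances[keep])]
        return queries[keep], ground_truth

    # No saved ground truth: fall back to a random subset of the queries, as the else branch above does.
    keep = rng.permutation(len(queries))[:num_queries]
    return queries[keep], None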
112 changes: 103 additions & 9 deletions benches/py-cakes/src/py_cakes/summarize_rust.py
@@ -5,6 +5,7 @@

import pandas
import matplotlib.pyplot as plt
import numpy

from . import utils

@@ -75,16 +76,16 @@ def summarize_rust(
datasets[dataset_name].add_csv_path(cardinality, f)
ball_csv_lists[dataset_name] = list(filter(lambda x: x.stem.startswith(dataset_name), ball_csv_paths))

for dataset in datasets.values():
dataset.summarize_results(out_dir)
# for dataset in datasets.values():
# dataset.summarize_results(out_dir)

for dataset_name, paths in ball_csv_lists.items():
paths = list(filter(lambda x: "balanced" not in x.stem, paths))
paths = list(filter(lambda x: "permuted" not in x.stem, paths))
keyed_paths = [(int(path.stem[len(dataset_name) + 1 :].split("-")[0]), path) for path in paths]
keyed_paths.sort(key=lambda x: x[0])
paths = [path for _, path in keyed_paths]
plot_lfd_deciles(
plot_lfd_percentiles(
out_dir=out_dir,
dataset=dataset_name,
ball_csv_path=paths[0],
@@ -268,10 +269,10 @@ def plot_throughput(
# box = ax.get_position()
# ax.set_position([box.x0, box.y0, box.width, box.height * 0.8])

# # Put a legend to under the plot
# # Put a legend under the plot
# ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1.1), fancybox=True)

# Tighten the layout and save the figure
# Tighten the layout
plt.tight_layout()

# Save the figure
@@ -343,12 +344,105 @@ def plot_distance_counts(
plt.close(fig)


def plot_lfd_deciles(
def plot_lfd_percentiles(
*,
out_dir: pathlib.Path,
dataset: str,
ball_csv_path: pathlib.Path,
):
logger.info(f" Plotting LFD Deciles {dataset}")
logger.info(f" Reading {ball_csv_path.name}")
logger.info(f" Saving to {out_dir / 'plots' / f'{dataset}_lfd_deciles.png'}")
logger.info(f" Plotting LFD Deciles {dataset = } with ball = {ball_csv_path.name}")

col_tuples = [
("minimum", 0, "tab:pink", "dotted", 0.2 * 2),
(" 5th percentile", 5, "tab:brown", "dashed", 0.3 * 2),
("25th percentile", 25, "tab:purple", "solid", 0.4 * 2),
("median", 50, "tab:red", "solid", 0.5 * 2),
("75th percentile", 75, "tab:green", "solid", 0.4 * 2),
("95th percentile", 95, "tab:orange", "dashed", 0.3 * 2),
("maximum", 100, "tab:blue", "dotted", 0.2 * 2),
]
columns = [t[0] for t in col_tuples]
percentiles = [t[1] for t in col_tuples]
colors = [t[2] for t in col_tuples]
styles = [t[3] for t in col_tuples]
widths = [t[4] for t in col_tuples]
shades_alphas = [
("blue", 0.05 * 2),
("orange", 0.1 * 2),
("green", 0.2 * 2),
("green", 0.2 * 2),
("orange", 0.1 * 2),
("blue", 0.05 * 2),
]

# We will make a new dataframe in which each row is one depth level in the
# ball tree and the columns are the percentiles of the LFD values for that
# depth level.
lfd_df = pandas.DataFrame(columns=columns)

inp_df = pandas.read_csv(ball_csv_path)
# Group by the "depth" column
for depth, group in inp_df.groupby("depth"):
if depth > 100:
continue
# Get all lfd values in the group as a list
lfd_values = group["lfd"].tolist()
# Calculate the percentile values
percentile_values = list(map(float, numpy.percentile(lfd_values, percentiles)))
# Add them to the dataframe
lfd_df.loc[depth] = percentile_values

logger.info(f" Created LFD dataframe with {lfd_df.shape[0]} rows")
logger.info(f" {lfd_df.head(10)}")

# Create a figure and axis
fig: plt.Figure
ax: plt.Axes
m = 0.8
fig, ax = plt.subplots(figsize=(6 * m, 4 * m))

# Plot the percentiles
for i, col in enumerate(columns):
ax.plot(lfd_df.index, lfd_df[col], label=col, color=colors[i], linestyle=styles[i], linewidth=widths[i])

# Shade the area between each pair of percentiles
for (y_lower, y_upper, (color, alpha)) in zip(columns[:-1], columns[1:], shades_alphas):
ax.fill_between(
lfd_df.index,
lfd_df[y_lower],
lfd_df[y_upper],
color=color,
alpha=alpha,
)

# Set the title and labels
ax.set_xlabel("Depth")
ax.set_ylabel("LFD")

# Set the y-axis limit to (0, 20)
ax.set_ylim(0, 20)
# Set the y-ticks to [2, 4, ..., 20]
y_ticks = numpy.arange(2, 21, 2)
ax.set_yticks(y_ticks)
# Add a horizontal line at each y-tick
for y in y_ticks:
ax.axhline(y, color="gray", linestyle="solid", linewidth=0.1)

# Make the top and right spines invisible
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Put a legend in the top right corner
ax.legend(loc="upper right", fancybox=True)

# Tighten the layout
plt.tight_layout()

# Save the figure
out_path = out_dir / "lfd" / f"{dataset}.png"
out_path.parent.mkdir(parents=False, exist_ok=True)
logger.info(f" Saving to {out_path}")
fig.savefig(out_path, dpi=300)

# Close the figure
plt.close(fig)
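
At the core of the new plot_lfd_percentiles helper is a table of per-depth LFD percentiles. As a self-contained sketch of just that computation, assuming a hypothetical ball CSV with the same "depth" and "lfd" columns the function reads:

import numpy
import pandas

percentiles = [0, 5, 25, 50, 75, 95, 100]
inp_df = pandas.read_csv("my-dataset-1000-ball.csv")  # hypothetical file name

# One row per tree depth, one column per percentile of the LFD values at that depth.
rows = {
    depth: numpy.percentile(group["lfd"].to_numpy(), percentiles)
    for depth, group in inp_df.groupby("depth")
    if depth <= 100  # same depth cutoff used in plot_lfd_percentiles
}
lfd_df = pandas.DataFrame.from_dict(
    rows, orient="index", columns=[f"p{p}" for p in percentiles]
)
print(lfd_df.head(10))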
