fixed random dataset run and added lfd-plots
nishaq503 committed Jan 9, 2025
1 parent 84b2fb0 commit f5cd265
Showing 3 changed files with 120 additions and 19 deletions.
2 changes: 1 addition & 1 deletion benches/cakes/src/main.rs
@@ -211,7 +211,7 @@ fn main() -> Result<(), String> {
};

let (queries, subsampled_paths) =
data_gen::read_fasta_and_subsample(&inp_dir, &out_dir, false, args.num_queries, max_power, seed)?;
data_gen::read_fasta_and_subsample(&inp_dir, &out_dir, true, args.num_queries, max_power, seed)?;
let queries = queries.into_iter().map(|(_, q)| q).collect::<Vec<_>>();

ftlog::info!("Found {} sub-sampled datasets:", subsampled_paths.len());
25 changes: 16 additions & 9 deletions benches/cakes/src/workflow.rs
@@ -75,15 +75,15 @@ where
let data = FlatVec::<Vec<f32>, usize>::read_npy(&data_path)?;

let neighbors_path = out_dir.as_ref().join(format!("{data_name}-neighbors.npy"));
let neighbors = FlatVec::<Vec<u64>, usize>::read_npy(&neighbors_path)?.take_items();
let neighbors = neighbors
.into_iter()
.map(|n| n.into_iter().map(Number::as_usize).collect::<Vec<_>>());

let distances_path = out_dir.as_ref().join(format!("{data_name}-distances.npy"));
let distances = FlatVec::<Vec<f32>, usize>::read_npy(&distances_path)?.take_items();
let (queries, neighbors) = if neighbors_path.exists() && distances_path.exists() {
let neighbors = FlatVec::<Vec<u64>, usize>::read_npy(&neighbors_path)?.take_items();
let neighbors = neighbors
.into_iter()
.map(|n| n.into_iter().map(Number::as_usize).collect::<Vec<_>>());

let distances = FlatVec::<Vec<f32>, usize>::read_npy(&distances_path)?.take_items();

let (queries, neighbors) = {
let neighbors = neighbors
.zip(distances)
.map(|(n, d)| n.into_iter().zip(d).collect::<Vec<_>>())
@@ -96,8 +96,15 @@
let _ = queries.split_off(num_queries);

let (queries, neighbors): (Vec<_>, Vec<_>) = queries.into_iter().unzip();
(queries, neighbors)
(queries, Some(neighbors))
} else {
let mut rng = rand::thread_rng();
let mut queries = queries.to_vec();
queries.shuffle(&mut rng);
let _ = queries.split_off(num_queries);
(queries, None)
};
let neighbors = neighbors.as_deref();

let all_paths = AllPaths::new(out_dir, data.name());
if rebuild_trees || !all_paths.all_exist() {
@@ -107,7 +114,7 @@
&all_paths,
metric,
&queries,
Some(&neighbors),
neighbors,
radial_fractions,
ks,
max_time,
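For illustration only, a minimal Python sketch of the fallback this workflow.rs change introduces: use the saved neighbors/distances ground truth when both .npy files exist, otherwise shuffle the queries and continue with no ground truth. The function name, array shapes, and paths below are hypothetical and not part of the crate's API.

from pathlib import Path

import numpy as np


def load_queries_and_ground_truth(out_dir: Path, data_name: str, queries: np.ndarray, num_queries: int, seed: int = 42):
    # Returns (queries, ground_truth); ground_truth is None when no saved results exist.
    rng = np.random.default_rng(seed)
    neighbors_path = out_dir / f"{data_name}-neighbors.npy"
    distances_path = out_dir / f"{data_name}-distances.npy"

    if neighbors_path.exists() and distances_path.exists():
        neighbors = np.load(neighbors_path)  # (num_stored_queries, k) indices
        distances = np.load(distances_path)  # (num_stored_queries, k) distances
        keep = rng.permutation(len(queries))[:num_queries]
        ground_truth = [list(zip(n, d)) for n, d in zip(neighbors[keep], distances[keep])]
        return queries[keep], ground_truth

    # No saved ground truth: fall back to a random subset of the queries, as the else branch above does.
    keep = rng.permutation(len(queries))[:num_queries]
    return queries[keep], None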
112 changes: 103 additions & 9 deletions benches/py-cakes/src/py_cakes/summarize_rust.py
@@ -5,6 +5,7 @@

import pandas
import matplotlib.pyplot as plt
import numpy

from . import utils

@@ -75,16 +76,16 @@ def summarize_rust(
datasets[dataset_name].add_csv_path(cardinality, f)
ball_csv_lists[dataset_name] = list(filter(lambda x: x.stem.startswith(dataset_name), ball_csv_paths))

for dataset in datasets.values():
dataset.summarize_results(out_dir)
# for dataset in datasets.values():
# dataset.summarize_results(out_dir)

for dataset_name, paths in ball_csv_lists.items():
paths = list(filter(lambda x: "balanced" not in x.stem, paths))
paths = list(filter(lambda x: "permuted" not in x.stem, paths))
keyed_paths = [(int(path.stem[len(dataset_name) + 1 :].split("-")[0]), path) for path in paths]
keyed_paths.sort(key=lambda x: x[0])
paths = [path for _, path in keyed_paths]
plot_lfd_deciles(
plot_lfd_percentiles(
out_dir=out_dir,
dataset=dataset_name,
ball_csv_path=paths[0],
@@ -268,10 +269,10 @@ def plot_throughput(
# box = ax.get_position()
# ax.set_position([box.x0, box.y0, box.width, box.height * 0.8])

# # Put a legend to under the plot
# # Put a legend under the plot
# ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1.1), fancybox=True)

# Tighten the layout and save the figure
# Tighten the layout
plt.tight_layout()

# Save the figure
@@ -343,12 +344,105 @@ def plot_distance_counts(
plt.close(fig)


def plot_lfd_deciles(
def plot_lfd_percentiles(
*,
out_dir: pathlib.Path,
dataset: str,
ball_csv_path: pathlib.Path,
):
logger.info(f" Plotting LFD Deciles {dataset}")
logger.info(f" Reading {ball_csv_path.name}")
logger.info(f" Saving to {out_dir / 'plots' / f'{dataset}_lfd_deciles.png'}")
logger.info(f" Plotting LFD Deciles {dataset = } with ball = {ball_csv_path.name}")

col_tuples = [
("minimum", 0, "tab:pink", "dotted", 0.2 * 2),
(" 5th percentile", 5, "tab:brown", "dashed", 0.3 * 2),
("25th percentile", 25, "tab:purple", "solid", 0.4 * 2),
("median", 50, "tab:red", "solid", 0.5 * 2),
("75th percentile", 75, "tab:green", "solid", 0.4 * 2),
("95th percentile", 95, "tab:orange", "dashed", 0.3 * 2),
("maximum", 100, "tab:blue", "dotted", 0.2 * 2),
]
columns = [t[0] for t in col_tuples]
percentiles = [t[1] for t in col_tuples]
colors = [t[2] for t in col_tuples]
styles = [t[3] for t in col_tuples]
widths = [t[4] for t in col_tuples]
shades_alphas = [
("blue", 0.05 * 2),
("orange", 0.1 * 2),
("green", 0.2 * 2),
("green", 0.2 * 2),
("orange", 0.1 * 2),
("blue", 0.05 * 2),
]

# We will make a new dataframe in which each row is one depth level in the
# ball tree and the columns are the percentiles of the LFD values for that
# depth level.
lfd_df = pandas.DataFrame(columns=columns)

inp_df = pandas.read_csv(ball_csv_path)
# Group by the "depth" column
for depth, group in inp_df.groupby("depth"):
if depth > 100:
continue
# Get all lfd values in the group as a list
lfd_values = group["lfd"].tolist()
# Calculate the percentile values
percentile_values = list(map(float, numpy.percentile(lfd_values, percentiles)))
# Add them to the dataframe
lfd_df.loc[depth] = percentile_values

logger.info(f" Created LFD dataframe with {lfd_df.shape[0]} rows")
logger.info(f" {lfd_df.head(10)}")

# Create a figure and axis
fig: plt.Figure
ax: plt.Axes
m = 0.8
fig, ax = plt.subplots(figsize=(6 * m, 4 * m))

# Plot the percentiles
for i, col in enumerate(columns):
ax.plot(lfd_df.index, lfd_df[col], label=col, color=colors[i], linestyle=styles[i], linewidth=widths[i])

# Shade the area between each pair of percentiles
for (y_lower, y_upper, (color, alpha)) in zip(columns[:-1], columns[1:], shades_alphas):
ax.fill_between(
lfd_df.index,
lfd_df[y_lower],
lfd_df[y_upper],
color=color,
alpha=alpha,
)

# Set the title and labels
ax.set_xlabel("Depth")
ax.set_ylabel("LFD")

# Set the y-axis limit to (0, 20)
ax.set_ylim(0, 20)
# Set the y-ticks to [2, 4, ..., 20]
y_ticks = numpy.arange(2, 21, 2)
ax.set_yticks(y_ticks)
# Add a horizontal line at each y-tick
for y in y_ticks:
ax.axhline(y, color="gray", linestyle="solid", linewidth=0.1)

# Make the top and right spines invisible
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Put a legend in the top right corner
ax.legend(loc="upper right", fancybox=True)

# Tighten the layout
plt.tight_layout()

# Save the figure
out_path = out_dir / "lfd" / f"{dataset}.png"
out_path.parent.mkdir(parents=False, exist_ok=True)
logger.info(f" Saving to {out_path}")
fig.savefig(out_path, dpi=300)

# Close the figure
plt.close(fig)
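
At the core of the new plot_lfd_percentiles helper is a table of per-depth LFD percentiles. As a self-contained sketch of just that computation, assuming a hypothetical ball CSV with the same "depth" and "lfd" columns the function reads:

import numpy
import pandas

percentiles = [0, 5, 25, 50, 75, 95, 100]
inp_df = pandas.read_csv("my-dataset-1000-ball.csv")  # hypothetical file name

# One row per tree depth, one column per percentile of the LFD values at that depth.
rows = {
    depth: numpy.percentile(group["lfd"].to_numpy(), percentiles)
    for depth, group in inp_df.groupby("depth")
    if depth <= 100  # same depth cutoff used in plot_lfd_percentiles
}
lfd_df = pandas.DataFrame.from_dict(
    rows, orient="index", columns=[f"p{p}" for p in percentiles]
)
print(lfd_df.head(10))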
