Skip to content

Commit

Permalink
Refine violin plot and optimize RDS compression
Browse files Browse the repository at this point in the history
Updated the violin plot in the PhD paper to provide a clearer
representation of the model evaluations, focusing on the worst and best
model performers based on median scores, with an IQR threshold
highlighted for significance. Also modified the saving of RDS files to
use XZ compression for efficiency. These adjustments lead to improved
clarity in model performance evaluation and more efficient storage of
large RDS files.
  • Loading branch information
franzbischoff committed Dec 30, 2023
1 parent 2e88268 commit 52fef0f
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 29 deletions.
Binary file modified output/scores_stats_model_rep.rds
Binary file not shown.
86 changes: 57 additions & 29 deletions papers/phd/FirstPaper.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -1317,18 +1317,17 @@ wrap_plots(plots, ncol = 1) + plot_annotation(
@fig-globalmodel shows the distribution of the FLOSS score, across the recordings, of the worst (left side) and best (right side) models, selected from the 10% worst and 10% best models by median score. The bluish color highlights the models with an IQR below 10.
```{r fig-globalmodel, eval=TRUE, echo=FALSE, fig.height=5, fig.width=10, out.width="100%"}
#| fig-cap: "Violin plot showing the distribution of the FLOSS score achieved by all tested models during the
#| inner ressample. The left half shows the models with the worst performances (10% overall), whereas
#| the right half shows the models with the best performances (10% overall).
#| The models are sorted (left-right) by the mean score (top) and by the median (below). Ties are
#| sorted by the SD and IQR, respectively. The bluish colors highlights models with an SD below 3
#| and IQR below 1."
#| fig-cap: "Violin plot showing the distribution of the FLOSS score achieved by
#| the worst (left) and the best (right) models in the inner resample.
#| The models are sorted in descending order (left to right) by median score.
#| The bluish color highlights models with an IQR below 10."
# cores_stats_model <- all_scores |> dplyr::mutate(across(all_of(predictors_names), as.factor, .unpack = FALSE))
library(patchwork)
if (file.exists(here::here("output", "scores_stats_model_rep.rds"))) {
scores_stats_model <- readRDS(here::here("output", "scores_stats_model_rep.rds"))
} else {
predictors_names <- c("time_constraint", "regime_threshold", "mp_threshold", "window_size", "regime_landmark")
scores_stats_model <- all_scores |>
dplyr::group_by(dplyr::across(dplyr::all_of(predictors_names))) |>
dplyr::mutate(model = glue::glue("{window_size}_{time_constraint}_{mp_threshold}_{regime_threshold}_{regime_landmark}")) |>
Expand All @@ -1341,7 +1340,7 @@ if (file.exists(here::here("output", "scores_stats_model_rep.rds"))) {
mean = mean(score), max = max(score),
sd = sd(score)
)
saveRDS(scores_stats_model, file = here::here("output", "scores_stats_model_rep.rds"))
saveRDS(scores_stats_model, file = here::here("output", "scores_stats_model_rep.rds"), compress = "xz")
}
Expand All @@ -1350,37 +1349,66 @@ scores_stats_model$id_text <- (sprintf("Model_%05d", (as.numeric(as.factor(score
scores_stats_model$record <- (sprintf("%03d", (as.numeric(factor(scores_stats_model$record, labels = levels(records_factors))))))
scores_stats_model <- scores_stats_model |> dplyr::select(-model)
low <- head(sort(unique(scores_stats_model$mean)), 20)
high <- tail(sort(unique(scores_stats_model$mean)), 20)
# Ids of the five models with the highest median score (higher = worse here),
# taken from the rows whose median lies above the 90th percentile of `median`.
worst_models <- scores_stats_model |>
  # keep the top decile by median
  dplyr::filter(median > stats::quantile(median, probs = 0.9)) |>
  # collapse to a single row per model id
  dplyr::group_by(id) |>
  dplyr::slice_head(n = 1) |>
  dplyr::ungroup() |>
  # highest medians first, then keep the first five ids
  dplyr::arrange(dplyr::desc(median)) |>
  dplyr::slice_head(n = 5) |>
  dplyr::pull(id)
model_mean <- scores_stats_model |>
dplyr::mutate(low_sd = sd < 3) |>
dplyr::filter(mean > high | mean < low) |>
ggplot2::ggplot(ggplot2::aes(x = reorder(reorder(id, -sd), -mean), y = score, colour = low_sd)) +
ggplot2::scale_colour_manual(values = c("FALSE" = "#ff0000c2", "TRUE" = "#0000ffb5")) +
ggplot2::geom_violin() +
ggplot2::coord_cartesian(ylim = c(0, 3)) +
# Violin plot of the score distribution of the models listed in `worst_models`.
# Models are ordered left to right by descending median and coloured by
# whether their IQR (q75 - q25) is below 10.
worst_models_scores <- scores_stats_model |>
  dplyr::mutate(low_iqr = factor(q75 - q25 < 10)) |>
  dplyr::filter(id %in% worst_models) |>
  ggplot2::ggplot(ggplot2::aes(
    x = reorder(id, -median),
    y = score, colour = low_iqr
  )) +
  ggplot2::geom_violin(scale = "width") +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1)) +
  ggplot2::labs(subtitle = "Ordered by Median", colour = "IQR < 10", x = "Model ID", y = "Score distribution") +
  # Colours are named by factor level: an unnamed vector is matched by position
  # against whichever levels are present after filtering, so a plot holding
  # only "TRUE" would otherwise be drawn in the "FALSE" colour.
  ggplot2::scale_color_manual(values = c("FALSE" = "#f2706d", "TRUE" = "#3ec5c7")) +
  ggplot2::scale_y_continuous(
    limits = c(0, 110),
    # keep scores outside the limits instead of the default out-of-bounds handling
    oob = scales::oob_keep
    # expand = c(0.1, 0.05, 0.1, -0.1)
  )
low <- head(sort(unique(scores_stats_model$median)), 20)
high <- tail(sort(unique(scores_stats_model$median)), 20)
# Ids of the five models with the lowest median score (lower = better here),
# taken from the rows whose median lies below the 10th percentile of `median`.
best_models <- scores_stats_model |>
  # keep the bottom decile by median
  dplyr::filter(median < stats::quantile(median, probs = 0.1)) |>
  # collapse to a single row per model id
  dplyr::group_by(id) |>
  dplyr::slice_head(n = 1) |>
  dplyr::ungroup() |>
  # sort descending so the five lowest medians are the last five rows
  dplyr::arrange(dplyr::desc(median)) |>
  dplyr::slice_tail(n = 5) |>
  dplyr::pull(id)
model_median <- scores_stats_model |>
dplyr::mutate(low_iqr = q75 - q25 < 1) |>
dplyr::filter(median > high | median < low) |>
ggplot2::ggplot(ggplot2::aes(x = reorder(reorder(id, -iqr), -median), y = score, colour = low_iqr)) +
ggplot2::geom_violin() +
ggplot2::coord_cartesian(ylim = c(0, 3)) +
# Violin plot of the score distribution of the models listed in `best_models`.
# Models are ordered left to right by descending median and coloured by
# whether their IQR (q75 - q25) is below 10.
best_models_scores <- scores_stats_model |>
  dplyr::mutate(low_iqr = factor(q75 - q25 < 10)) |>
  dplyr::filter(id %in% best_models) |>
  ggplot2::ggplot(ggplot2::aes(
    x = reorder(id, -median),
    y = score, colour = low_iqr
  )) +
  ggplot2::geom_violin(scale = "width") +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1)) +
  ggplot2::labs(subtitle = "Ordered by Median", colour = "IQR < 10", x = "Model ID", y = "Score distribution") +
  # Colours are named by factor level so "TRUE" (IQR < 10) is always bluish and
  # "FALSE" always reddish, matching the worst-models plot. A positional
  # (reversed) vector would swap the colours as soon as a "FALSE" level is
  # present among the best models.
  ggplot2::scale_color_manual(values = c("FALSE" = "#f2706d", "TRUE" = "#3ec5c7")) +
  ggplot2::scale_y_continuous(
    limits = c(0, 110)
    # oob = scales::oob_squish,
    # expand = c(0.1, 0.05, 0.1, -0.1)
  )
(model_mean / model_median) + plot_layout(guides = "auto") +
worst_models_scores + best_models_scores + plot_layout(guides = "collect") +
plot_annotation(
title = "Scores grouped by model",
theme = ggplot2::theme_bw()
theme = ggplot2::theme_bw() + ggplot2::theme(
plot.title = ggplot2::element_text(size = 20)
)
)
```
Expand Down

0 comments on commit 52fef0f

Please sign in to comment.