From 12675eadf335b110cf067124470af63c5e8a2ef1 Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com>
Date: Fri, 6 Dec 2024 09:32:15 -0500
Subject: [PATCH 1/6] Filter out 10x samples without subgroup label in gather
 metadata

---
 01-gather_metadata.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/01-gather_metadata.R b/01-gather_metadata.R
index a4291a6..dc9a1b5 100644
--- a/01-gather_metadata.R
+++ b/01-gather_metadata.R
@@ -223,6 +223,8 @@ GSE119926_metadata <- GEOquery::getGEO(filename = GSE119926_metadata_input_filen
 # Metadata sample 966-recurrence likely matches data file 966-2.tsv
 # Metadata sample 934-repeat MAY match data file 943.tsv, since that's the only remaining non-matching file
 # We remove them here to be conservative.
+# We also remove any samples that have NA for a subgroup, as we can not
+# test models without a subgroup label.
 
 GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename,
                                       col_types = "c") |>
@@ -237,7 +239,8 @@ GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename,
                 subtype = NA) |>
   dplyr::filter(sample_accession != "966-recurrence",
                 sample_accession != "934-repeat") |>
-  clean_mb_subgroups()
+  clean_mb_subgroups() |>
+  dplyr::filter(!is.na(subgroup))
 
 ################################################################################
 # combine bulk metadata and write to file

From ec88cf9dcaae092d307921a053943fd10306b9e7 Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com>
Date: Fri, 6 Dec 2024 14:39:50 -0500
Subject: [PATCH 2/6] WIP: Add 10X to prediction

---
 predict/predict_pseudobulk_and_single_cells.R | 43 ++++++++++++++-----
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/predict/predict_pseudobulk_and_single_cells.R b/predict/predict_pseudobulk_and_single_cells.R
index 477789f..0e0627a 100644
--- a/predict/predict_pseudobulk_and_single_cells.R
+++ b/predict/predict_pseudobulk_and_single_cells.R
@@ -16,12 +16,18 @@ models_dir <- here::here("models")
 plots_dir <- here::here("plots")
 plots_data_dir <- here::here(plots_dir, "data")
 processed_data_dir <- here::here("processed_data")
-pseudobulk_sce_dir <- here::here(processed_data_dir, "pseudobulk_sce")
+pseudobulk_data_dir <- here::here(processed_data_dir, "pseudobulk")
+smartseq_data_dir <- here::here(pseudobulk_data_dir, "GSE119926")
+tenx_data_dir <- here::here(pseudobulk_data_dir, "GSE155446")
 
 # Input files
 # pseudobulk data for each sample was generated in 02-gather_data.R
-pseudobulk_genex_filepath <- here::here(processed_data_dir, "pseudobulk_genex.tsv")
-pseudobulk_metadata_filepath <- here::here(processed_data_dir, "pseudobulk_metadata.tsv")
+singlecell_metadata_filepath <- here::here(pseudobulk_data_dir,
+                                           "pseudobulk_metadata.tsv")
+smartseq_genex_filepath <- here::here(smartseq_data_dir,
+                                      "GSE119926_pseudobulk_genex.tsv")
+tenx_genex_filepath <- here::here(tenx_data_dir,
+                                  "GSE155446_pseudobulk_genex.tsv")
 
 # baseline models are generated prior to this in the notebook analysis_notebooks/baseline_models.Rmd
 # there is one baseline model designated as the official model that is used to test single cells here
@@ -34,14 +40,22 @@ pseudobulk_plot_data_filepath <- here::here(plots_data_dir, "pseudobulk_test_pre
 
 # Read in data
 
-pseudobulk_genex_df <- readr::read_tsv(pseudobulk_genex_filepath,
+smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath,
                                        show_col_types = FALSE) |>
   tibble::column_to_rownames(var = "gene")
+tenx_genex_df <- readr::read_tsv(tenx_genex_filepath,
+                                 show_col_types = FALSE) |>
+  tibble::column_to_rownames(var = "gene")
 
-pseudobulk_metadata_df <- readr::read_tsv(pseudobulk_metadata_filepath,
+singlecell_metadata_df <- readr::read_tsv(singlecell_metadata_filepath,
                                           show_col_types = FALSE) |>
   dplyr::mutate(sample_accession = title)
 
+smartseq_metadata_df <- singlecell_metadata_df |>
+  dplyr::filter(study == "GSE119926")
+tenx_metadata_df <- singlecell_metadata_df |>
+  dplyr::filter(study == "GSE155446")
+
 baseline_models <- readr::read_rds(baseline_models_filepath)
 official_model <- which(purrr::map_lgl(baseline_models, \(x) x[["official_model"]]))
 
@@ -52,13 +66,22 @@ mb_subgroups <- c("G3", "G4", "SHH", "WNT")
 
 # Predict the subgroup of pseudobulk samples
 
-pseudobulk_test_list <- list("ktsp" = test_ktsp(genex_df_test = pseudobulk_genex_df,
-                                                metadata_df_test = pseudobulk_metadata_df,
+pseudobulk_test_list <- list(
+  "smartseq" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df,
+                                                metadata_df_test = smartseq_metadata_df,
                                                 classifier = classifier_list[["ktsp"]],
                                                 labels = mb_subgroups),
-                             "rf" = test_rf(genex_df_test = pseudobulk_genex_df,
-                                            metadata_df_test = pseudobulk_metadata_df,
-                                            classifier = classifier_list[["rf"]]))
+                             "rf" = test_rf(genex_df_test = smartseq_genex_df,
+                                            metadata_df_test = smartseq_metadata_df,
+                                            classifier = classifier_list[["rf"]])),
+  "tenx" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df,
+                                   metadata_df_test = tenx_metadata_df,
+                                   classifier = classifier_list[["ktsp"]],
+                                   labels = mb_subgroups),
+                "rf" = test_rf(genex_df_test = tenx_genex_df,
+                               metadata_df_test = tenx_metadata_df,
+                               classifier = classifier_list[["rf"]]))
+)
 
 # Prep pseudobulk plot data
 

From 7d7a57f40e5d60e51e29328f391275df6a76b75c Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com>
Date: Sun, 8 Dec 2024 12:43:50 -0500
Subject: [PATCH 3/6] Finish adding 10x data to single-cell prediction

---
 predict/predict_pseudobulk_and_single_cells.R | 96 +++++++++++--------
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/predict/predict_pseudobulk_and_single_cells.R b/predict/predict_pseudobulk_and_single_cells.R
index 0e0627a..8ae90e1 100644
--- a/predict/predict_pseudobulk_and_single_cells.R
+++ b/predict/predict_pseudobulk_and_single_cells.R
@@ -38,27 +38,43 @@ baseline_models_filepath <- here::here(models_dir, "baseline.rds")
 single_cell_plot_data_filepath <- here::here(plots_data_dir, "single_cell_test_predictions.tsv")
 pseudobulk_plot_data_filepath <- here::here(plots_data_dir, "pseudobulk_test_predictions.tsv")
 
-# Read in data
+# Get file paths of individual SingleCellExperiment RDS objects
+sce_files <- c(fs::dir_ls(path = fs::path(smartseq_data_dir,
+                                          "pseudobulk_sce"),
+                          glob = "*_sce.rds"),
+               fs::dir_ls(path = fs::path(tenx_data_dir,
+                                          "pseudobulk_sce"),
+                          glob = "*_sce.rds")
+)
+# Name the vector using the sample title
+names(sce_files) <- stringr::word(names(sce_files), start = -1, sep = "/") |>
+  stringr::str_remove_all("\\_sce.rds")
 
+# Read in data
+# Smart-Seq2 pseudobulk data
 smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath,
                                        show_col_types = FALSE) |>
   tibble::column_to_rownames(var = "gene")
+
+# 10x pseudobulk data
 tenx_genex_df <- readr::read_tsv(tenx_genex_filepath,
                                  show_col_types = FALSE) |>
   tibble::column_to_rownames(var = "gene")
 
+# Metadata
 singlecell_metadata_df <- readr::read_tsv(singlecell_metadata_filepath,
                                           show_col_types = FALSE) |>
   dplyr::mutate(sample_accession = title)
 
+# Split up by study to pass to test_*()
 smartseq_metadata_df <- singlecell_metadata_df |>
   dplyr::filter(study == "GSE119926")
 tenx_metadata_df <- singlecell_metadata_df |>
   dplyr::filter(study == "GSE155446")
 
+# Read in and prepare models
 baseline_models <- readr::read_rds(baseline_models_filepath)
 official_model <- which(purrr::map_lgl(baseline_models, \(x) x[["official_model"]]))
-
 classifier_list <- list(ktsp = baseline_models[[official_model]]$ktsp_weighted$classifier,
                         rf = baseline_models[[official_model]]$rf_weighted$classifier)
 
@@ -67,38 +83,43 @@ mb_subgroups <- c("G3", "G4", "SHH", "WNT")
 # Predict the subgroup of pseudobulk samples
 
 pseudobulk_test_list <- list(
-  "smartseq" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df,
-                                                metadata_df_test = smartseq_metadata_df,
-                                                classifier = classifier_list[["ktsp"]],
-                                                labels = mb_subgroups),
-                             "rf" = test_rf(genex_df_test = smartseq_genex_df,
-                                            metadata_df_test = smartseq_metadata_df,
-                                            classifier = classifier_list[["rf"]])),
-  "tenx" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df,
-                                   metadata_df_test = tenx_metadata_df,
-                                   classifier = classifier_list[["ktsp"]],
-                                   labels = mb_subgroups),
-                "rf" = test_rf(genex_df_test = tenx_genex_df,
-                               metadata_df_test = tenx_metadata_df,
-                               classifier = classifier_list[["rf"]]))
+  "GSE119926" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df,
+                                        metadata_df_test = smartseq_metadata_df,
+                                        classifier = classifier_list[["ktsp"]],
+                                        labels = mb_subgroups),
+                     "rf" = test_rf(genex_df_test = smartseq_genex_df,
+                                    metadata_df_test = smartseq_metadata_df,
+                                    classifier = classifier_list[["rf"]])),
+  "GSE155446" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df,
+                                        metadata_df_test = tenx_metadata_df,
+                                        classifier = classifier_list[["ktsp"]],
+                                        labels = mb_subgroups),
+                     "rf" = test_rf(genex_df_test = tenx_genex_df,
+                                    metadata_df_test = tenx_metadata_df,
+                                    classifier = classifier_list[["rf"]]))
 )
 
 # Prep pseudobulk plot data
 
-pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list, # list of test objects and
-                                  names(pseudobulk_test_list), # their classifier model types
-                                  \(pseudobulk_test, model_type) pseudobulk_test$model_output |>
-                                    as.data.frame() |>
-                                    tibble::rownames_to_column(var = "sample_accession") |>
-                                    tibble::as_tibble() |>
-                                    dplyr::select(sample_accession,
-                                                  G3,
-                                                  G4,
-                                                  SHH,
-                                                  WNT) |>
-                                    dplyr::mutate(model_type = model_type)) |>
+pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list,  # each dataset
+                                  names(pseudobulk_test_list),  # and series accession
+                                  \(singlecell_dataset, study)
+                                  purrr::map2(singlecell_dataset, # list of test objects and
+                                              names(singlecell_dataset), # their classifier model types
+                                              \(pseudobulk_test, model_type) pseudobulk_test$model_output |>
+                                                as.data.frame() |>
+                                                tibble::rownames_to_column(var = "sample_accession") |>
+                                                tibble::as_tibble() |>
+                                                dplyr::select(sample_accession,
+                                                              G3,
+                                                              G4,
+                                                              SHH,
+                                                              WNT) |>
+                                                dplyr::mutate(model_type = model_type,
+                                                              study = study))) |>
+  purrr::flatten_df() |>
   dplyr::bind_rows() |>
-  dplyr::left_join(pseudobulk_metadata_df |>
+  dplyr::left_join(singlecell_metadata_df |>
                      dplyr::select(sample_accession,
                                    subgroup),
                    by = "sample_accession") |>
@@ -112,24 +133,21 @@ readr::write_tsv(x = pseudobulk_plot_df,
 # Predict the subgroup of single cells
 
 model_test_list <- purrr::map(names(classifier_list), # classifier model types
-                              \(model_type) purrr::map(pseudobulk_metadata_df$title,
+                              \(model_type) purrr::map(singlecell_metadata_df$title,
                                                        \(x) test_single_cells(sample_acc = x,
-                                                                              sce_filepath = here::here(pseudobulk_sce_dir,
-                                                                                                        stringr::str_c(x, "_sce.rds")),
-                                                                              metadata_df = pseudobulk_metadata_df,
+                                                                              sce_filepath = sce_files[x],
+                                                                              metadata_df = singlecell_metadata_df,
                                                                               labels = mb_subgroups,
                                                                               classifier = classifier_list[[model_type]],
-                                                                              study = "GSE119926",
                                                                               platform = "scRNA-seq")) |>
-                                purrr::set_names(pseudobulk_metadata_df$title)) |>
+                                purrr::set_names(singlecell_metadata_df$title)) |>
   purrr::set_names(names(classifier_list))
 
 # Prep single cell plot data
 
 get_umap_coord_and_cluster <- function(sample_title) {
 
-  sce_filepath <- here::here(pseudobulk_sce_dir,
-                             stringr::str_c(sample_title, "_sce.rds"))
+  sce_filepath <- sce_files[sample_title]
 
   sce_object <- readr::read_rds(sce_filepath)
 
@@ -145,7 +163,7 @@ get_umap_coord_and_cluster <- function(sample_title) {
 
 }
 
-umap_list <- purrr::map(pseudobulk_metadata_df$title,
+umap_list <- purrr::map(singlecell_metadata_df$title,
                         \(sample_title) get_umap_coord_and_cluster(sample_title)) |>
   dplyr::bind_rows()
 
@@ -169,7 +187,7 @@ single_cell_plot_df <- purrr::map2(model_test_list, # list of test objects and
   dplyr::ungroup() |>
   dplyr::left_join(umap_list,
                    by = c("sample_accession", "cell_index")) |>
-  dplyr::left_join(pseudobulk_metadata_df |>
+  dplyr::left_join(singlecell_metadata_df |>
                      dplyr::select(sample_accession,
                                    study,
                                    subgroup,

From 4456ec38a6a82ab1b0e03900e467a8faeef61792 Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com>
Date: Fri, 13 Dec 2024 14:37:45 -0500
Subject: [PATCH 4/6] Change GP3/4 from NA to G3/G4 in subgroup cleaning

---
 01-gather_metadata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/01-gather_metadata.R b/01-gather_metadata.R
index dc9a1b5..d490127 100644
--- a/01-gather_metadata.R
+++ b/01-gather_metadata.R
@@ -36,7 +36,7 @@ clean_mb_subgroups <- function(df){
                                               subgroup %in% c("NORM", "n/a (NORM)") ~ "Normal",
                                               subgroup %in% c("B", "MB_SHH", "SHH_alpha", "SHH_beta", "SHH_delta", "SHH_gamma", "SHH-infant", "SHH-adult") ~ "SHH",
                                               subgroup %in% c("A", "MB_WNT", "WNT_alpha", "WNT_beta") ~ "WNT",
-                                              subgroup %in% c("GP3/4") ~ NA,
+                                              subgroup %in% c("GP3/4") ~ "G3/G4",
                                               .default = subgroup))
 
   return(df)

From f27846852e047a7e040b3847af8070e12ebf1398 Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com>
Date: Fri, 13 Dec 2024 14:45:17 -0500
Subject: [PATCH 5/6] Don't filter out NA subgroup for 10x data

---
 01-gather_metadata.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/01-gather_metadata.R b/01-gather_metadata.R
index d490127..94e70b4 100644
--- a/01-gather_metadata.R
+++ b/01-gather_metadata.R
@@ -239,8 +239,7 @@ GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename,
                 subtype = NA) |>
   dplyr::filter(sample_accession != "966-recurrence",
                 sample_accession != "934-repeat") |>
-  clean_mb_subgroups() |>
-  dplyr::filter(!is.na(subgroup))
+  clean_mb_subgroups()
 
 ################################################################################
 # combine bulk metadata and write to file

From 28bddb4ef21fada2089558cc2d1d3715870f493e Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:18:27 -0500
Subject: [PATCH 6/6] Throw an error if duplicate sample titles for SCEs

---
 predict/predict_pseudobulk_and_single_cells.R | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/predict/predict_pseudobulk_and_single_cells.R b/predict/predict_pseudobulk_and_single_cells.R
index 8ae90e1..1bc17f0 100644
--- a/predict/predict_pseudobulk_and_single_cells.R
+++ b/predict/predict_pseudobulk_and_single_cells.R
@@ -46,10 +46,22 @@ sce_files <- c(fs::dir_ls(path = fs::path(smartseq_data_dir,
                                           "pseudobulk_sce"),
                           glob = "*_sce.rds")
 )
-# Name the vector using the sample title
-names(sce_files) <- stringr::word(names(sce_files), start = -1, sep = "/") |>
+# Extract sample titles from the file names
+sce_sample_titles <- stringr::word(names(sce_files), start = -1, sep = "/") |>
   stringr::str_remove_all("\\_sce.rds")
 
+# Fail if there are any sample title collisions
+if (any(duplicated(sce_sample_titles))) {
+
+  stop("Duplicate single-cell sample title detected!")
+
+} else {
+
+  # Name the vector using the sample title
+  names(sce_files) <- sce_sample_titles
+
+}
+
 # Read in data
 # Smart-Seq2 pseudobulk data
 smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath,