Merge pull request #84 from AlexsLemonade/jaclyn-taroni/82-predict-10x

Add 10X dataset to single-cell prediction script
AlexsLemonade · Dec 13, 2024 · 675c60c · 675c60c
2 parents f400ad9 + 28bddb4
commit 675c60c
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 38 deletions.
diff --git a/01-gather_metadata.R b/01-gather_metadata.R
@@ -36,7 +36,7 @@ clean_mb_subgroups <- function(df){
                                               subgroup %in% c("NORM", "n/a (NORM)") ~ "Normal",
                                               subgroup %in% c("B", "MB_SHH", "SHH_alpha", "SHH_beta", "SHH_delta", "SHH_gamma", "SHH-infant", "SHH-adult") ~ "SHH",
                                               subgroup %in% c("A", "MB_WNT", "WNT_alpha", "WNT_beta") ~ "WNT",
-                                              subgroup %in% c("GP3/4") ~ NA,
+                                              subgroup %in% c("GP3/4") ~ "G3/G4",
                                               .default = subgroup))
 
   return(df)
@@ -223,6 +223,8 @@ GSE119926_metadata <- GEOquery::getGEO(filename = GSE119926_metadata_input_filen
 # Metadata sample 966-recurrence likely matches data file 966-2.tsv
 # Metadata sample 934-repeat MAY match data file 943.tsv, since that's the only remaining non-matching file
 # We remove them here to be conservative.
+# We also remove any samples that have NA for a subgroup, as we can not
+# test models without a subgroup label.
 
 GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename,
                                       col_types = "c") |>

diff --git a/predict/predict_pseudobulk_and_single_cells.R b/predict/predict_pseudobulk_and_single_cells.R
@@ -16,12 +16,18 @@ models_dir <- here::here("models")
 plots_dir <- here::here("plots")
 plots_data_dir <- here::here(plots_dir, "data")
 processed_data_dir <- here::here("processed_data")
-pseudobulk_sce_dir <- here::here(processed_data_dir, "pseudobulk_sce")
+pseudobulk_data_dir <- here::here(processed_data_dir, "pseudobulk")
+smartseq_data_dir <- here::here(pseudobulk_data_dir, "GSE119926")
+tenx_data_dir <- here::here(pseudobulk_data_dir, "GSE155446")
 
 # Input files
 # pseudobulk data for each sample was generated in 02-gather_data.R
-pseudobulk_genex_filepath <- here::here(processed_data_dir, "pseudobulk_genex.tsv")
-pseudobulk_metadata_filepath <- here::here(processed_data_dir, "pseudobulk_metadata.tsv")
+singlecell_metadata_filepath <- here::here(pseudobulk_data_dir,
+                                           "pseudobulk_metadata.tsv")
+smartseq_genex_filepath <- here::here(smartseq_data_dir,
+                                      "GSE119926_pseudobulk_genex.tsv")
+tenx_genex_filepath <- here::here(tenx_data_dir,
+                                  "GSE155446_pseudobulk_genex.tsv")
 
 # baseline models are generated prior to this in the notebook analysis_notebooks/baseline_models.Rmd
 # there is one baseline model designated as the official model that is used to test single cells here
@@ -32,50 +38,100 @@ baseline_models_filepath <- here::here(models_dir, "baseline.rds")
 single_cell_plot_data_filepath <- here::here(plots_data_dir, "single_cell_test_predictions.tsv")
 pseudobulk_plot_data_filepath <- here::here(plots_data_dir, "pseudobulk_test_predictions.tsv")
 
-# Read in data
+# Get file paths of individual SingleCellExperiment RDS objects
+sce_files <- c(fs::dir_ls(path = fs::path(smartseq_data_dir,
+                                          "pseudobulk_sce"),
+                          glob = "*_sce.rds"),
+               fs::dir_ls(path = fs::path(tenx_data_dir,
+                                          "pseudobulk_sce"),
+                          glob = "*_sce.rds")
+)
+# Extract sample titles from the file names
+sce_sample_titles <- stringr::word(names(sce_files), start = -1, sep = "/") |>
+  stringr::str_remove_all("\\_sce.rds")
+
+# Fail if there are any sample title collisions
+if (any(duplicated(sce_sample_titles))) {
+
+  stop("Duplicate single-cell sample title detected!")
+
+} else {
+
+  # Name the vector using the sample title
+  names(sce_files) <- sce_sample_titles
+
+}
 
-pseudobulk_genex_df <- readr::read_tsv(pseudobulk_genex_filepath,
+# Read in data
+# Smart-Seq2 pseudobulk data
+smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath,
                                        show_col_types = FALSE) |>
   tibble::column_to_rownames(var = "gene")
 
-pseudobulk_metadata_df <- readr::read_tsv(pseudobulk_metadata_filepath,
+# 10x pseudobulk data
+tenx_genex_df <- readr::read_tsv(tenx_genex_filepath,
+                                 show_col_types = FALSE) |>
+  tibble::column_to_rownames(var = "gene")
+
+# Metadata
+singlecell_metadata_df <- readr::read_tsv(singlecell_metadata_filepath,
                                           show_col_types = FALSE) |>
   dplyr::mutate(sample_accession = title)
 
+# Split up by study to pass to test_*()
+smartseq_metadata_df <- singlecell_metadata_df |>
+  dplyr::filter(study == "GSE119926")
+tenx_metadata_df <- singlecell_metadata_df |>
+  dplyr::filter(study == "GSE155446")
+
+# Read in and prepare models
 baseline_models <- readr::read_rds(baseline_models_filepath)
 official_model <- which(purrr::map_lgl(baseline_models, \(x) x[["official_model"]]))
-
 classifier_list <- list(ktsp = baseline_models[[official_model]]$ktsp_weighted$classifier,
                         rf = baseline_models[[official_model]]$rf_weighted$classifier)
 
 mb_subgroups <- c("G3", "G4", "SHH", "WNT")
 
 # Predict the subgroup of pseudobulk samples
 
-pseudobulk_test_list <- list("ktsp" = test_ktsp(genex_df_test = pseudobulk_genex_df,
-                                                metadata_df_test = pseudobulk_metadata_df,
-                                                classifier = classifier_list[["ktsp"]],
-                                                labels = mb_subgroups),
-                             "rf" = test_rf(genex_df_test = pseudobulk_genex_df,
-                                            metadata_df_test = pseudobulk_metadata_df,
-                                            classifier = classifier_list[["rf"]]))
+pseudobulk_test_list <- list(
+  "GSE119926" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df,
+                                        metadata_df_test = smartseq_metadata_df,
+                                        classifier = classifier_list[["ktsp"]],
+                                        labels = mb_subgroups),
+                     "rf" = test_rf(genex_df_test = smartseq_genex_df,
+                                    metadata_df_test = smartseq_metadata_df,
+                                    classifier = classifier_list[["rf"]])),
+  "GSE155446" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df,
+                                        metadata_df_test = tenx_metadata_df,
+                                        classifier = classifier_list[["ktsp"]],
+                                        labels = mb_subgroups),
+                     "rf" = test_rf(genex_df_test = tenx_genex_df,
+                                    metadata_df_test = tenx_metadata_df,
+                                    classifier = classifier_list[["rf"]]))
+)
 
 # Prep pseudobulk plot data
 
-pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list, # list of test objects and
-                                  names(pseudobulk_test_list), # their classifier model types
-                                  \(pseudobulk_test, model_type) pseudobulk_test$model_output |>
-                                    as.data.frame() |>
-                                    tibble::rownames_to_column(var = "sample_accession") |>
-                                    tibble::as_tibble() |>
-                                    dplyr::select(sample_accession,
-                                                  G3,
-                                                  G4,
-                                                  SHH,
-                                                  WNT) |>
-                                    dplyr::mutate(model_type = model_type)) |>
+pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list,  # each dataset
+                                  names(pseudobulk_test_list),  # and series accession
+                                  \(singlecell_dataset, study)
+                                  purrr::map2(singlecell_dataset, # list of test objects and
+                                              names(singlecell_dataset), # their classifier model types
+                                              \(pseudobulk_test, model_type) pseudobulk_test$model_output |>
+                                                as.data.frame() |>
+                                                tibble::rownames_to_column(var = "sample_accession") |>
+                                                tibble::as_tibble() |>
+                                                dplyr::select(sample_accession,
+                                                              G3,
+                                                              G4,
+                                                              SHH,
+                                                              WNT) |>
+                                                dplyr::mutate(model_type = model_type,
+                                                              study = study))) |>
+  purrr::flatten_df() |>
   dplyr::bind_rows() |>
-  dplyr::left_join(pseudobulk_metadata_df |>
+  dplyr::left_join(singlecell_metadata_df |>
                      dplyr::select(sample_accession,
                                    subgroup),
                    by = "sample_accession") |>
@@ -89,24 +145,21 @@ readr::write_tsv(x = pseudobulk_plot_df,
 # Predict the subgroup of single cells
 
 model_test_list <- purrr::map(names(classifier_list), # classifier model types
-                              \(model_type) purrr::map(pseudobulk_metadata_df$title,
+                              \(model_type) purrr::map(singlecell_metadata_df$title,
                                                        \(x) test_single_cells(sample_acc = x,
-                                                                              sce_filepath = here::here(pseudobulk_sce_dir,
-                                                                                                        stringr::str_c(x, "_sce.rds")),
-                                                                              metadata_df = pseudobulk_metadata_df,
+                                                                              sce_filepath = sce_files[x],
+                                                                              metadata_df = singlecell_metadata_df,
                                                                               labels = mb_subgroups,
                                                                               classifier = classifier_list[[model_type]],
-                                                                              study = "GSE119926",
                                                                               platform = "scRNA-seq")) |>
-                                purrr::set_names(pseudobulk_metadata_df$title)) |>
+                                purrr::set_names(singlecell_metadata_df$title)) |>
   purrr::set_names(names(classifier_list))
 
 # Prep single cell plot data
 
 get_umap_coord_and_cluster <- function(sample_title) {
 
-  sce_filepath <- here::here(pseudobulk_sce_dir,
-                             stringr::str_c(sample_title, "_sce.rds"))
+  sce_filepath <- sce_files[sample_title]
 
   sce_object <- readr::read_rds(sce_filepath)
 
@@ -122,7 +175,7 @@ get_umap_coord_and_cluster <- function(sample_title) {
 
 }
 
-umap_list <- purrr::map(pseudobulk_metadata_df$title,
+umap_list <- purrr::map(singlecell_metadata_df$title,
                         \(sample_title) get_umap_coord_and_cluster(sample_title)) |>
   dplyr::bind_rows()
 
@@ -146,7 +199,7 @@ single_cell_plot_df <- purrr::map2(model_test_list, # list of test objects and
   dplyr::ungroup() |>
   dplyr::left_join(umap_list,
                    by = c("sample_accession", "cell_index")) |>
-  dplyr::left_join(pseudobulk_metadata_df |>
+  dplyr::left_join(singlecell_metadata_df |>
                      dplyr::select(sample_accession,
                                    study,
                                    subgroup,