From 12675eadf335b110cf067124470af63c5e8a2ef1 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Fri, 6 Dec 2024 09:32:15 -0500 Subject: [PATCH 1/6] Filter out 10x samples without subgroup label in gather metadata --- 01-gather_metadata.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/01-gather_metadata.R b/01-gather_metadata.R index a4291a6..dc9a1b5 100644 --- a/01-gather_metadata.R +++ b/01-gather_metadata.R @@ -223,6 +223,8 @@ GSE119926_metadata <- GEOquery::getGEO(filename = GSE119926_metadata_input_filen # Metadata sample 966-recurrence likely matches data file 966-2.tsv # Metadata sample 934-repeat MAY match data file 943.tsv, since that's the only remaining non-matching file # We remove them here to be conservative. +# We also remove any samples that have NA for a subgroup, as we can not +# test models without a subgroup label. GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename, col_types = "c") |> @@ -237,7 +239,8 @@ GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename, subtype = NA) |> dplyr::filter(sample_accession != "966-recurrence", sample_accession != "934-repeat") |> - clean_mb_subgroups() + clean_mb_subgroups() |> + dplyr::filter(!is.na(subgroup)) ################################################################################ # combine bulk metadata and write to file From ec88cf9dcaae092d307921a053943fd10306b9e7 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:39:50 -0500 Subject: [PATCH 2/6] WIP: Add 10X to prediction --- predict/predict_pseudobulk_and_single_cells.R | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/predict/predict_pseudobulk_and_single_cells.R b/predict/predict_pseudobulk_and_single_cells.R index 477789f..0e0627a 100644 --- a/predict/predict_pseudobulk_and_single_cells.R +++ b/predict/predict_pseudobulk_and_single_cells.R @@ -16,12 +16,18 @@ models_dir <- here::here("models") plots_dir <- here::here("plots") plots_data_dir <- here::here(plots_dir, "data") processed_data_dir <- here::here("processed_data") -pseudobulk_sce_dir <- here::here(processed_data_dir, "pseudobulk_sce") +pseudobulk_data_dir <- here::here(processed_data_dir, "pseudobulk") +smartseq_data_dir <- here::here(pseudobulk_data_dir, "GSE119926") +tenx_data_dir <- here::here(pseudobulk_data_dir, "GSE155446") # Input files # pseudobulk data for each sample was generated in 02-gather_data.R -pseudobulk_genex_filepath <- here::here(processed_data_dir, "pseudobulk_genex.tsv") -pseudobulk_metadata_filepath <- here::here(processed_data_dir, "pseudobulk_metadata.tsv") +singlecell_metadata_filepath <- here::here(pseudobulk_data_dir, + "pseudobulk_metadata.tsv") +smartseq_genex_filepath <- here::here(smartseq_data_dir, + "GSE119926_pseudobulk_genex.tsv") +tenx_genex_filepath <- here::here(tenx_data_dir, + "GSE155446_pseudobulk_genex.tsv") # baseline models are generated prior to this in the notebook analysis_notebooks/baseline_models.Rmd # there is one baseline model designated as the official model that is used to test single cells here @@ -34,14 +40,22 @@ pseudobulk_plot_data_filepath <- here::here(plots_data_dir, "pseudobulk_test_pre # Read in data -pseudobulk_genex_df <- readr::read_tsv(pseudobulk_genex_filepath, +smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath, show_col_types = FALSE) |> tibble::column_to_rownames(var = "gene") +tenx_genex_df <- readr::read_tsv(tenx_genex_filepath, + show_col_types = FALSE) |> + tibble::column_to_rownames(var = "gene") -pseudobulk_metadata_df <- readr::read_tsv(pseudobulk_metadata_filepath, +singlecell_metadata_df <- readr::read_tsv(singlecell_metadata_filepath, show_col_types = FALSE) |> dplyr::mutate(sample_accession = title) +smartseq_metadata_df <- singlecell_metadata_df |> + dplyr::filter(study == "GSE119926") +tenx_metadata_df <- singlecell_metadata_df |> + dplyr::filter(study == "GSE155446") + baseline_models <- readr::read_rds(baseline_models_filepath) official_model <- which(purrr::map_lgl(baseline_models, \(x) x[["official_model"]])) @@ -52,13 +66,22 @@ mb_subgroups <- c("G3", "G4", "SHH", "WNT") # Predict the subgroup of pseudobulk samples -pseudobulk_test_list <- list("ktsp" = test_ktsp(genex_df_test = pseudobulk_genex_df, - metadata_df_test = pseudobulk_metadata_df, +pseudobulk_test_list <- list( + "smartseq" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df, + metadata_df_test = smartseq_metadata_df, classifier = classifier_list[["ktsp"]], labels = mb_subgroups), - "rf" = test_rf(genex_df_test = pseudobulk_genex_df, - metadata_df_test = pseudobulk_metadata_df, - classifier = classifier_list[["rf"]])) + "rf" = test_rf(genex_df_test = smartseq_genex_df, + metadata_df_test = smartseq_metadata_df, + classifier = classifier_list[["rf"]])), + "tenx" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df, + metadata_df_test = tenx_metadata_df, + classifier = classifier_list[["ktsp"]], + labels = mb_subgroups), + "rf" = test_rf(genex_df_test = tenx_genex_df, + metadata_df_test = tenx_metadata_df, + classifier = classifier_list[["rf"]])) +) # Prep pseudobulk plot data From 7d7a57f40e5d60e51e29328f391275df6a76b75c Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Sun, 8 Dec 2024 12:43:50 -0500 Subject: [PATCH 3/6] Finish adding 10x data to single-cell prediction --- predict/predict_pseudobulk_and_single_cells.R | 96 +++++++++++-------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/predict/predict_pseudobulk_and_single_cells.R b/predict/predict_pseudobulk_and_single_cells.R index 0e0627a..8ae90e1 100644 --- a/predict/predict_pseudobulk_and_single_cells.R +++ b/predict/predict_pseudobulk_and_single_cells.R @@ -38,27 +38,43 @@ baseline_models_filepath <- here::here(models_dir, "baseline.rds") single_cell_plot_data_filepath <- here::here(plots_data_dir, "single_cell_test_predictions.tsv") pseudobulk_plot_data_filepath <- here::here(plots_data_dir, "pseudobulk_test_predictions.tsv") -# Read in data +# Get file paths of individual SingleCellExperiment RDS objects +sce_files <- c(fs::dir_ls(path = fs::path(smartseq_data_dir, + "pseudobulk_sce"), + glob = "*_sce.rds"), + fs::dir_ls(path = fs::path(tenx_data_dir, + "pseudobulk_sce"), + glob = "*_sce.rds") +) +# Name the vector using the sample title +names(sce_files) <- stringr::word(names(sce_files), start = -1, sep = "/") |> + stringr::str_remove_all("\\_sce.rds") +# Read in data +# Smart-Seq2 pseudobulk data smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath, show_col_types = FALSE) |> tibble::column_to_rownames(var = "gene") + +# 10x pseudobulk data tenx_genex_df <- readr::read_tsv(tenx_genex_filepath, show_col_types = FALSE) |> tibble::column_to_rownames(var = "gene") +# Metadata singlecell_metadata_df <- readr::read_tsv(singlecell_metadata_filepath, show_col_types = FALSE) |> dplyr::mutate(sample_accession = title) +# Split up by study to pass to test_*() smartseq_metadata_df <- singlecell_metadata_df |> dplyr::filter(study == "GSE119926") tenx_metadata_df <- singlecell_metadata_df |> dplyr::filter(study == "GSE155446") +# Read in and prepare models baseline_models <- readr::read_rds(baseline_models_filepath) official_model <- which(purrr::map_lgl(baseline_models, \(x) x[["official_model"]])) - classifier_list <- list(ktsp = baseline_models[[official_model]]$ktsp_weighted$classifier, rf = baseline_models[[official_model]]$rf_weighted$classifier) @@ -67,38 +83,43 @@ mb_subgroups <- c("G3", "G4", "SHH", "WNT") # Predict the subgroup of pseudobulk samples pseudobulk_test_list <- list( - "smartseq" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df, - metadata_df_test = smartseq_metadata_df, - classifier = classifier_list[["ktsp"]], - labels = mb_subgroups), - "rf" = test_rf(genex_df_test = smartseq_genex_df, - metadata_df_test = smartseq_metadata_df, - classifier = classifier_list[["rf"]])), - "tenx" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df, - metadata_df_test = tenx_metadata_df, - classifier = classifier_list[["ktsp"]], - labels = mb_subgroups), - "rf" = test_rf(genex_df_test = tenx_genex_df, - metadata_df_test = tenx_metadata_df, - classifier = classifier_list[["rf"]])) + "GSE119926" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df, + metadata_df_test = smartseq_metadata_df, + classifier = classifier_list[["ktsp"]], + labels = mb_subgroups), + "rf" = test_rf(genex_df_test = smartseq_genex_df, + metadata_df_test = smartseq_metadata_df, + classifier = classifier_list[["rf"]])), + "GSE155446" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df, + metadata_df_test = tenx_metadata_df, + classifier = classifier_list[["ktsp"]], + labels = mb_subgroups), + "rf" = test_rf(genex_df_test = tenx_genex_df, + metadata_df_test = tenx_metadata_df, + classifier = classifier_list[["rf"]])) ) # Prep pseudobulk plot data -pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list, # list of test objects and - names(pseudobulk_test_list), # their classifier model types - \(pseudobulk_test, model_type) pseudobulk_test$model_output |> - as.data.frame() |> - tibble::rownames_to_column(var = "sample_accession") |> - tibble::as_tibble() |> - dplyr::select(sample_accession, - G3, - G4, - SHH, - WNT) |> - dplyr::mutate(model_type = model_type)) |> +pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list, # each dataset + names(pseudobulk_test_list), # and series accession + \(singlecell_dataset, study) + purrr::map2(singlecell_dataset, # list of test objects and + names(singlecell_dataset), # their classifier model types + \(pseudobulk_test, model_type) pseudobulk_test$model_output |> + as.data.frame() |> + tibble::rownames_to_column(var = "sample_accession") |> + tibble::as_tibble() |> + dplyr::select(sample_accession, + G3, + G4, + SHH, + WNT) |> + dplyr::mutate(model_type = model_type, + study = study))) |> + purrr::flatten_df() |> dplyr::bind_rows() |> - dplyr::left_join(pseudobulk_metadata_df |> + dplyr::left_join(singlecell_metadata_df |> dplyr::select(sample_accession, subgroup), by = "sample_accession") |> @@ -112,24 +133,21 @@ readr::write_tsv(x = pseudobulk_plot_df, # Predict the subgroup of single cells model_test_list <- purrr::map(names(classifier_list), # classifier model types - \(model_type) purrr::map(pseudobulk_metadata_df$title, + \(model_type) purrr::map(singlecell_metadata_df$title, \(x) test_single_cells(sample_acc = x, - sce_filepath = here::here(pseudobulk_sce_dir, - stringr::str_c(x, "_sce.rds")), - metadata_df = pseudobulk_metadata_df, + sce_filepath = sce_files[x], + metadata_df = singlecell_metadata_df, labels = mb_subgroups, classifier = classifier_list[[model_type]], - study = "GSE119926", platform = "scRNA-seq")) |> - purrr::set_names(pseudobulk_metadata_df$title)) |> + purrr::set_names(singlecell_metadata_df$title)) |> purrr::set_names(names(classifier_list)) # Prep single cell plot data get_umap_coord_and_cluster <- function(sample_title) { - sce_filepath <- here::here(pseudobulk_sce_dir, - stringr::str_c(sample_title, "_sce.rds")) + sce_filepath <- sce_files[sample_title] sce_object <- readr::read_rds(sce_filepath) @@ -145,7 +163,7 @@ get_umap_coord_and_cluster <- function(sample_title) { } -umap_list <- purrr::map(pseudobulk_metadata_df$title, +umap_list <- purrr::map(singlecell_metadata_df$title, \(sample_title) get_umap_coord_and_cluster(sample_title)) |> dplyr::bind_rows() @@ -169,7 +187,7 @@ single_cell_plot_df <- purrr::map2(model_test_list, # list of test objects and dplyr::ungroup() |> dplyr::left_join(umap_list, by = c("sample_accession", "cell_index")) |> - dplyr::left_join(pseudobulk_metadata_df |> + dplyr::left_join(singlecell_metadata_df |> dplyr::select(sample_accession, study, subgroup, From 4456ec38a6a82ab1b0e03900e467a8faeef61792 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:37:45 -0500 Subject: [PATCH 4/6] Change GP3/4 from NA to G3/G4 in subgroup cleaning --- 01-gather_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/01-gather_metadata.R b/01-gather_metadata.R index dc9a1b5..d490127 100644 --- a/01-gather_metadata.R +++ b/01-gather_metadata.R @@ -36,7 +36,7 @@ clean_mb_subgroups <- function(df){ subgroup %in% c("NORM", "n/a (NORM)") ~ "Normal", subgroup %in% c("B", "MB_SHH", "SHH_alpha", "SHH_beta", "SHH_delta", "SHH_gamma", "SHH-infant", "SHH-adult") ~ "SHH", subgroup %in% c("A", "MB_WNT", "WNT_alpha", "WNT_beta") ~ "WNT", - subgroup %in% c("GP3/4") ~ NA, + subgroup %in% c("GP3/4") ~ "G3/G4", .default = subgroup)) return(df) From f27846852e047a7e040b3847af8070e12ebf1398 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:45:17 -0500 Subject: [PATCH 5/6] Don't filter out NA subgroup for 10x data --- 01-gather_metadata.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/01-gather_metadata.R b/01-gather_metadata.R index d490127..94e70b4 100644 --- a/01-gather_metadata.R +++ b/01-gather_metadata.R @@ -239,8 +239,7 @@ GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename, subtype = NA) |> dplyr::filter(sample_accession != "966-recurrence", sample_accession != "934-repeat") |> - clean_mb_subgroups() |> - dplyr::filter(!is.na(subgroup)) + clean_mb_subgroups() ################################################################################ # combine bulk metadata and write to file From 28bddb4ef21fada2089558cc2d1d3715870f493e Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:18:27 -0500 Subject: [PATCH 6/6] Throw an error if duplicate sample titles for SCEs --- predict/predict_pseudobulk_and_single_cells.R | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/predict/predict_pseudobulk_and_single_cells.R b/predict/predict_pseudobulk_and_single_cells.R index 8ae90e1..1bc17f0 100644 --- a/predict/predict_pseudobulk_and_single_cells.R +++ b/predict/predict_pseudobulk_and_single_cells.R @@ -46,10 +46,22 @@ sce_files <- c(fs::dir_ls(path = fs::path(smartseq_data_dir, "pseudobulk_sce"), glob = "*_sce.rds") ) -# Name the vector using the sample title -names(sce_files) <- stringr::word(names(sce_files), start = -1, sep = "/") |> +# Extract sample titles from the file names +sce_sample_titles <- stringr::word(names(sce_files), start = -1, sep = "/") |> stringr::str_remove_all("\\_sce.rds") +# Fail if there are any sample title collisions +if (any(duplicated(sce_sample_titles))) { + + stop("Duplicate single-cell sample title detected!") + +} else { + + # Name the vector using the sample title + names(sce_files) <- sce_sample_titles + +} + # Read in data # Smart-Seq2 pseudobulk data smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath,