Skip to content

Commit

Permalink
Merge pull request #84 from AlexsLemonade/jaclyn-taroni/82-predict-10x
Browse files Browse the repository at this point in the history
Add 10X dataset to single-cell prediction script
  • Loading branch information
jaclyn-taroni authored Dec 13, 2024
2 parents f400ad9 + 28bddb4 commit 675c60c
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 38 deletions.
4 changes: 3 additions & 1 deletion 01-gather_metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ clean_mb_subgroups <- function(df){
subgroup %in% c("NORM", "n/a (NORM)") ~ "Normal",
subgroup %in% c("B", "MB_SHH", "SHH_alpha", "SHH_beta", "SHH_delta", "SHH_gamma", "SHH-infant", "SHH-adult") ~ "SHH",
subgroup %in% c("A", "MB_WNT", "WNT_alpha", "WNT_beta") ~ "WNT",
subgroup %in% c("GP3/4") ~ NA,
subgroup %in% c("GP3/4") ~ "G3/G4",
.default = subgroup))

return(df)
Expand Down Expand Up @@ -223,6 +223,8 @@ GSE119926_metadata <- GEOquery::getGEO(filename = GSE119926_metadata_input_filen
# Metadata sample 966-recurrence likely matches data file 966-2.tsv
# Metadata sample 934-repeat MAY match data file 943.tsv, since that's the only remaining non-matching file
# We remove them here to be conservative.
# We also remove any samples that have NA for a subgroup, as we can not
# test models without a subgroup label.

GSE155446_metadata <- readr::read_csv(GSE155446_metadata_input_filename,
col_types = "c") |>
Expand Down
127 changes: 90 additions & 37 deletions predict/predict_pseudobulk_and_single_cells.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@ models_dir <- here::here("models")
plots_dir <- here::here("plots")
plots_data_dir <- here::here(plots_dir, "data")
processed_data_dir <- here::here("processed_data")
pseudobulk_sce_dir <- here::here(processed_data_dir, "pseudobulk_sce")
pseudobulk_data_dir <- here::here(processed_data_dir, "pseudobulk")
smartseq_data_dir <- here::here(pseudobulk_data_dir, "GSE119926")
tenx_data_dir <- here::here(pseudobulk_data_dir, "GSE155446")

# Input files
# pseudobulk data for each sample was generated in 02-gather_data.R
pseudobulk_genex_filepath <- here::here(processed_data_dir, "pseudobulk_genex.tsv")
pseudobulk_metadata_filepath <- here::here(processed_data_dir, "pseudobulk_metadata.tsv")
singlecell_metadata_filepath <- here::here(pseudobulk_data_dir,
"pseudobulk_metadata.tsv")
smartseq_genex_filepath <- here::here(smartseq_data_dir,
"GSE119926_pseudobulk_genex.tsv")
tenx_genex_filepath <- here::here(tenx_data_dir,
"GSE155446_pseudobulk_genex.tsv")

# baseline models are generated prior to this in the notebook analysis_notebooks/baseline_models.Rmd
# there is one baseline model designated as the official model that is used to test single cells here
Expand All @@ -32,50 +38,100 @@ baseline_models_filepath <- here::here(models_dir, "baseline.rds")
single_cell_plot_data_filepath <- here::here(plots_data_dir, "single_cell_test_predictions.tsv")
pseudobulk_plot_data_filepath <- here::here(plots_data_dir, "pseudobulk_test_predictions.tsv")

# Read in data
# Get file paths of individual SingleCellExperiment RDS objects
sce_files <- c(fs::dir_ls(path = fs::path(smartseq_data_dir,
"pseudobulk_sce"),
glob = "*_sce.rds"),
fs::dir_ls(path = fs::path(tenx_data_dir,
"pseudobulk_sce"),
glob = "*_sce.rds")
)
# Extract sample titles from the file names
sce_sample_titles <- stringr::word(names(sce_files), start = -1, sep = "/") |>
stringr::str_remove_all("\\_sce.rds")

# Fail if there are any sample title collisions
if (any(duplicated(sce_sample_titles))) {

stop("Duplicate single-cell sample title detected!")

} else {

# Name the vector using the sample title
names(sce_files) <- sce_sample_titles

}

pseudobulk_genex_df <- readr::read_tsv(pseudobulk_genex_filepath,
# Read in data
# Smart-Seq2 pseudobulk data
smartseq_genex_df <- readr::read_tsv(smartseq_genex_filepath,
show_col_types = FALSE) |>
tibble::column_to_rownames(var = "gene")

pseudobulk_metadata_df <- readr::read_tsv(pseudobulk_metadata_filepath,
# 10x pseudobulk data
tenx_genex_df <- readr::read_tsv(tenx_genex_filepath,
show_col_types = FALSE) |>
tibble::column_to_rownames(var = "gene")

# Metadata
singlecell_metadata_df <- readr::read_tsv(singlecell_metadata_filepath,
show_col_types = FALSE) |>
dplyr::mutate(sample_accession = title)

# Split up by study to pass to test_*()
smartseq_metadata_df <- singlecell_metadata_df |>
dplyr::filter(study == "GSE119926")
tenx_metadata_df <- singlecell_metadata_df |>
dplyr::filter(study == "GSE155446")

# Read in and prepare models
baseline_models <- readr::read_rds(baseline_models_filepath)
official_model <- which(purrr::map_lgl(baseline_models, \(x) x[["official_model"]]))

classifier_list <- list(ktsp = baseline_models[[official_model]]$ktsp_weighted$classifier,
rf = baseline_models[[official_model]]$rf_weighted$classifier)

mb_subgroups <- c("G3", "G4", "SHH", "WNT")

# Predict the subgroup of pseudobulk samples

pseudobulk_test_list <- list("ktsp" = test_ktsp(genex_df_test = pseudobulk_genex_df,
metadata_df_test = pseudobulk_metadata_df,
classifier = classifier_list[["ktsp"]],
labels = mb_subgroups),
"rf" = test_rf(genex_df_test = pseudobulk_genex_df,
metadata_df_test = pseudobulk_metadata_df,
classifier = classifier_list[["rf"]]))
pseudobulk_test_list <- list(
"GSE119926" = list("ktsp" = test_ktsp(genex_df_test = smartseq_genex_df,
metadata_df_test = smartseq_metadata_df,
classifier = classifier_list[["ktsp"]],
labels = mb_subgroups),
"rf" = test_rf(genex_df_test = smartseq_genex_df,
metadata_df_test = smartseq_metadata_df,
classifier = classifier_list[["rf"]])),
"GSE155446" = list("ktsp" = test_ktsp(genex_df_test = tenx_genex_df,
metadata_df_test = tenx_metadata_df,
classifier = classifier_list[["ktsp"]],
labels = mb_subgroups),
"rf" = test_rf(genex_df_test = tenx_genex_df,
metadata_df_test = tenx_metadata_df,
classifier = classifier_list[["rf"]]))
)

# Prep pseudobulk plot data

pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list, # list of test objects and
names(pseudobulk_test_list), # their classifier model types
\(pseudobulk_test, model_type) pseudobulk_test$model_output |>
as.data.frame() |>
tibble::rownames_to_column(var = "sample_accession") |>
tibble::as_tibble() |>
dplyr::select(sample_accession,
G3,
G4,
SHH,
WNT) |>
dplyr::mutate(model_type = model_type)) |>
pseudobulk_plot_df <- purrr::map2(pseudobulk_test_list, # each dataset
names(pseudobulk_test_list), # and series accession
\(singlecell_dataset, study)
purrr::map2(singlecell_dataset, # list of test objects and
names(singlecell_dataset), # their classifier model types
\(pseudobulk_test, model_type) pseudobulk_test$model_output |>
as.data.frame() |>
tibble::rownames_to_column(var = "sample_accession") |>
tibble::as_tibble() |>
dplyr::select(sample_accession,
G3,
G4,
SHH,
WNT) |>
dplyr::mutate(model_type = model_type,
study = study))) |>
purrr::flatten_df() |>
dplyr::bind_rows() |>
dplyr::left_join(pseudobulk_metadata_df |>
dplyr::left_join(singlecell_metadata_df |>
dplyr::select(sample_accession,
subgroup),
by = "sample_accession") |>
Expand All @@ -89,24 +145,21 @@ readr::write_tsv(x = pseudobulk_plot_df,
# Predict the subgroup of single cells

model_test_list <- purrr::map(names(classifier_list), # classifier model types
\(model_type) purrr::map(pseudobulk_metadata_df$title,
\(model_type) purrr::map(singlecell_metadata_df$title,
\(x) test_single_cells(sample_acc = x,
sce_filepath = here::here(pseudobulk_sce_dir,
stringr::str_c(x, "_sce.rds")),
metadata_df = pseudobulk_metadata_df,
sce_filepath = sce_files[x],
metadata_df = singlecell_metadata_df,
labels = mb_subgroups,
classifier = classifier_list[[model_type]],
study = "GSE119926",
platform = "scRNA-seq")) |>
purrr::set_names(pseudobulk_metadata_df$title)) |>
purrr::set_names(singlecell_metadata_df$title)) |>
purrr::set_names(names(classifier_list))

# Prep single cell plot data

get_umap_coord_and_cluster <- function(sample_title) {

sce_filepath <- here::here(pseudobulk_sce_dir,
stringr::str_c(sample_title, "_sce.rds"))
sce_filepath <- sce_files[sample_title]

sce_object <- readr::read_rds(sce_filepath)

Expand All @@ -122,7 +175,7 @@ get_umap_coord_and_cluster <- function(sample_title) {

}

umap_list <- purrr::map(pseudobulk_metadata_df$title,
umap_list <- purrr::map(singlecell_metadata_df$title,
\(sample_title) get_umap_coord_and_cluster(sample_title)) |>
dplyr::bind_rows()

Expand All @@ -146,7 +199,7 @@ single_cell_plot_df <- purrr::map2(model_test_list, # list of test objects and
dplyr::ungroup() |>
dplyr::left_join(umap_list,
by = c("sample_accession", "cell_index")) |>
dplyr::left_join(pseudobulk_metadata_df |>
dplyr::left_join(singlecell_metadata_df |>
dplyr::select(sample_accession,
study,
subgroup,
Expand Down

0 comments on commit 675c60c

Please sign in to comment.