add pseudobulking functionality #7
sreichl committed Feb 13, 2024

Verified: this commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 8bb337f commit 7efcd1f
Showing 7 changed files with 127 additions and 18 deletions.
7 changes: 7 additions & 0 deletions config/config.yaml
@@ -67,6 +67,13 @@ split_by:
# example: "pass_QC == 'True'"
filter_expression: "mitoPercent <= 20"

+ ##### PSEUDOBULK #####
+ # configure pseudobulking by metadata columns and aggregation method
+ # leave method empty "" to skip
+ pseudobulk:
+     by: ['patient', 'cellType', 'treatment'] # must be metadata columns
+     method: "sum" # options: "sum", "mean", or "median"

##### NORMALIZATION #####
# using SCTransform defaults https://satijalab.org/seurat/reference/sctransform

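For orientation, a minimal R sketch (not part of this commit) of what this configuration asks for: cells that share the same values in the configured "by" columns are collapsed into one pseudobulk sample using the configured aggregation method ("sum" here). The toy data, gene names, and values below are purely illustrative.

library("dplyr")

# toy per-cell counts with two of the grouping columns from the config above
cells <- data.frame(
    patient  = c("P1", "P1", "P1", "P2"),
    cellType = c("T", "T", "B", "T"),
    geneA    = c(1, 2, 5, 3),
    geneB    = c(0, 1, 2, 4)
)

# collapse cells into pseudobulk samples, one row per (patient, cellType) combination
pseudobulk <- cells %>%
    group_by(patient, cellType) %>%
    summarise(across(c(geneA, geneB), sum), .groups = "drop")

# expected result: P1_B (geneA=5, geneB=2), P1_T (3, 1), P2_T (3, 4)
print(pseudobulk)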
19 changes: 11 additions & 8 deletions workflow/Snakefile
@@ -68,19 +68,22 @@ rule all:
        split=data_splits,
        step=metadata_plot_steps,
        ),
+     pseudobulk_counts = (expand(os.path.join(result_path,'{split}','PSEUDOBULK','RNA.csv'),
+         split=data_splits,
+         )) if (config["pseudobulk"]["method"]!="") else [],
    normalized_plot_dirs = (expand(os.path.join(result_path,'{split}','{step}','plots','{plot_type}','{category}','{feature_list}'),
        split=data_splits,
        step=["NORMALIZED"],
        plot_type=['VlnPlot', 'RidgePlot', 'DotPlot', 'Heatmap'],
        category=config["vis_categories"],
        feature_list=vis_features_normalized,
        )) if (config["stop_after"]=="CORRECTED") else [],
    corrected_plot_dirs = (expand(os.path.join(result_path,'{split}','{step}','plots','{plot_type}','{category}','{feature_list}'),
        split=data_splits,
        step=["CORRECTED"],
        plot_type=['VlnPlot', 'RidgePlot', 'Heatmap'],
        category=config["vis_categories"],
        feature_list=vis_features_corrected,
        )) if (config["stop_after"]=="CORRECTED") else [],
    envs = expand(os.path.join(config["result_path"],'envs','scrnaseq_processing_seurat','{env}.yaml'),env=['seurat','inspectdf']),
    gene_lists = expand(os.path.join(config["result_path"],'configs','scrnaseq_processing_seurat','{gene_list}.txt'),gene_list=list(gene_list_dict.keys())),
1 change: 1 addition & 0 deletions workflow/envs/seurat.yaml
@@ -6,3 +6,4 @@ dependencies:
- r-seurat=4.0.6
- bioconductor-glmgampoi=1.6.0
- r-data.table=1.14.2
- r-dplyr=1.1.2
4 changes: 2 additions & 2 deletions workflow/rules/normalize_correct_score.smk
@@ -7,7 +7,7 @@ rule normalize:
    norm_object = os.path.join(result_path,'{split}','NORMALIZED','object.rds'),
    metadata = report(os.path.join(result_path,'{split}','NORMALIZED','metadata.csv'),
        caption="../report/metadata.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="{split}",
        labels={
            "step": "NORMALIZED",
@@ -48,7 +48,7 @@ rule correct:
    norm_object = os.path.join(result_path,'{split}','CORRECTED','object.rds'),
    metadata = report(os.path.join(result_path,'{split}','CORRECTED','metadata.csv'),
        caption="../report/metadata.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="{split}",
        labels={
            "step": "CORRECTED",
40 changes: 34 additions & 6 deletions workflow/rules/process.smk
@@ -6,7 +6,7 @@ rule prepare:
    sample_object = os.path.join(result_path,'batch_{sample}','PREP','object.rds'),
    metadata = report(os.path.join(result_path,'batch_{sample}','PREP','metadata.csv'),
        caption="../report/metadata_sample.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="{sample}",
        labels={
            "step": "PREP",
@@ -45,7 +45,7 @@ rule merge:
    merged_object = os.path.join(result_path,'merged','RAW','object.rds'),
    metadata = report(os.path.join(result_path,'merged','RAW','metadata.csv'),
        caption="../report/metadata_merged.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="merged",
        labels={
            "step": "RAW",
@@ -80,7 +80,7 @@ rule split:
    split_object = os.path.join(result_path,'{split}','RAW','object.rds'),
    metadata = report(os.path.join(result_path,'{split}','RAW','metadata.csv'),
        caption="../report/metadata.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="{split}",
        labels={
            "step": "RAW",
@@ -115,7 +115,7 @@ rule filter_cells:
    filtered_object = os.path.join(result_path,'{split}','FILTERED','object.rds'),
    metadata = report(os.path.join(result_path,'{split}','FILTERED','metadata.csv'),
        caption="../report/metadata.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="{split}",
        labels={
            "step": "FILTERED",
@@ -140,8 +140,36 @@ rule filter_cells:
        custom_flag = config["modality_flags"]['Custom'],
    script:
        "../scripts/filter.R"


+ # pseudobulk cells into samples
+ rule pseudobulk:
+     input:
+         filtered_object = os.path.join(result_path,'{split}','FILTERED','object.rds'),
+     output:
+         pseudobulk_counts = os.path.join(result_path,'{split}','PSEUDOBULK','RNA.csv'),
+         metadata = report(os.path.join(result_path,'{split}','PSEUDOBULK','metadata.csv'),
+             caption="../report/metadata.rst",
+             category="{}_{}".format(config["project_name"], module_name),
+             subcategory="{split}",
+             labels={
+                 "step": "PSEUDOBULK",
+                 "type": "CSV",
+                 "category": "All",
+                 "list": "Metadata",
+                 "feature": "",
+             }),
+     resources:
+         mem_mb=config.get("mem", "16000"),
+     threads: config.get("threads", 1)
+     conda:
+         "../envs/seurat.yaml"
+     log:
+         os.path.join("logs","rules","pseudobulk_{split}.log"),
+     params:
+         partition=config.get("partition"),
+     script:
+         "../scripts/pseudobulk.R"

# save counts as CSV of seurat object
rule save_counts:
    input:
4 changes: 2 additions & 2 deletions workflow/rules/visualize.smk
@@ -6,7 +6,7 @@ rule metadata_plots:
    metadata_plots = report(directory(os.path.join(result_path,'{split}','{step}','plots','metadata')),
        patterns=["{datatype}.png"],
        caption="../report/metadata_vis.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="{split}",
        labels={
            "step": "{step}",
@@ -37,7 +37,7 @@ rule seurat_plots:
        directory(os.path.join(result_path,'{split}','{step}','plots','{plot_type}','{category}','{feature_list}')),
        patterns=["{feature}.png"],
        caption="../report/seurat_plot.rst",
-         category="{}_scrnaseq_processing_seurat".format(config["project_name"]),
+         category="{}_{}".format(config["project_name"], module_name),
        subcategory="{split}",
        labels={
            "step": "{step}",
70 changes: 70 additions & 0 deletions workflow/scripts/pseudobulk.R
@@ -0,0 +1,70 @@

#### load libraries & utility function
library("Seurat")
library("data.table")
library("dplyr")

# source utility functions
# source("workflow/scripts/utils.R")
# snakemake@source("./utils.R")

# inputs
filtered_object_path <- snakemake@input[["filtered_object"]]

# outputs
pseudobulk_counts_path <- snakemake@output[["pseudobulk_counts"]]

# parameters
ab_flag <- snakemake@config[["modality_flags"]][['Antibody_Capture']]
crispr_flag <- snakemake@config[["modality_flags"]][['CRISPR_Guide_Capture']]
custom_flag <- snakemake@config[["modality_flags"]][['Custom']]

pseudobulk_by <- snakemake@config[["pseudobulk"]][["by"]]
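# match.fun resolves the configured method name to the corresponding base R function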
pseudobulk_method <- match.fun(snakemake@config[["pseudobulk"]][["method"]]) # can be "sum", "mean", or "median"

### load filtered data
seurat_object <- readRDS(file = file.path(filtered_object_path))

# get metadata
metadata <- as.data.frame(seurat_object[[]])
metadata$ID <- rownames(metadata)

# pseudobulk per modality (RNA, ...)
for (modality in c("RNA", ab_flag, crispr_flag, custom_flag)){
    if (modality==""){
        next
    }

    # extract raw counts and transpose so that rows are cells and columns are features
    tmp_data <- as.data.frame(t(as.data.frame(GetAssayData(object = seurat_object, slot = "counts", assay = modality))))
    features <- colnames(tmp_data)
    tmp_data$ID <- rownames(tmp_data)

    # join with metadata
    tmp_data <- inner_join(tmp_data, metadata, by = "ID")

    # aggregate cells by the configured grouping columns using the configured method
    tmp_pseudobulk <- tmp_data %>%
        group_by(across(all_of(pseudobulk_by))) %>%
        summarise(across(all_of(features), pseudobulk_method, .names = "{.col}"), .groups = "drop")

    # reformat df and create row names from the grouping columns
    # (drop = FALSE keeps a data.frame even if only one grouping column is configured)
    tmp_pseudobulk <- as.data.frame(tmp_pseudobulk)
    rownames(tmp_pseudobulk) <- apply(tmp_pseudobulk[, pseudobulk_by, drop = FALSE], 1, function(x) paste(x, collapse = "_"))
    tmp_pseudobulk[, pseudobulk_by] <- NULL
    # transpose back so that rows are features and columns are pseudobulk samples
    tmp_pseudobulk <- as.data.frame(t(tmp_pseudobulk))

    # save pseudobulked data frame
    fwrite(as.data.frame(tmp_pseudobulk), file=file.path(dirname(pseudobulk_counts_path),paste0(modality,".csv")), row.names=TRUE)
}


# generate aggregated metadata sheet with the number of cells per pseudobulk sample
metadata_aggregated <- metadata %>%
    group_by(across(all_of(pseudobulk_by))) %>%
    summarise(cell_count = n(), .groups = "drop")

# format metadata (drop = FALSE keeps a data.frame even if only one grouping column is configured)
metadata_aggregated <- as.data.frame(metadata_aggregated)
rownames(metadata_aggregated) <- apply(metadata_aggregated[, pseudobulk_by, drop = FALSE], 1, function(x) paste(x, collapse = "_"))

# save metadata
fwrite(as.data.frame(metadata_aggregated), file=file.path(dirname(pseudobulk_counts_path),"metadata.csv"), row.names=TRUE)
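
As a usage sketch (not part of this commit), the files written above could be read back downstream as follows; the result path and split name are illustrative placeholders.

library("data.table")

# features x pseudobulk-samples count matrix; fwrite(..., row.names=TRUE) stores the
# feature names in the first, unnamed column
rna <- fread("<result_path>/<split>/PSEUDOBULK/RNA.csv")
rna_mat <- as.matrix(rna[, -1, with = FALSE])
rownames(rna_mat) <- rna[[1]]

# aggregated metadata with one row per pseudobulk sample and its cell_count
sample_metadata <- fread("<result_path>/<split>/PSEUDOBULK/metadata.csv")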
