From c48fc3b4dcfd4df96d1d20503d2ae936a4d6951d Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 28 Nov 2023 09:13:54 -0500 Subject: [PATCH 01/35] feat: add Rmd for diffbind from legacy pipeliner source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/Results-template/Scripts/DiffBind_pipeliner.Rmd --- bin/DiffBind_pipeliner.Rmd | 198 +++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 bin/DiffBind_pipeliner.Rmd diff --git a/bin/DiffBind_pipeliner.Rmd b/bin/DiffBind_pipeliner.Rmd new file mode 100644 index 00000000..f4d75622 --- /dev/null +++ b/bin/DiffBind_pipeliner.Rmd @@ -0,0 +1,198 @@ +--- +title: "DiffBind: CCBR ChIP-seq pipeline" +output: + html_document: + toc: true + toc_depth: 2 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macs" + projectID: "" + projectDesc: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +## grab args +projectID <- params$projectID +projectDesc <- params$projectDesc +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") + +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +``` + +### **Project:** +#### *`r projectID`* +### **Description:** +#### *`r projectDesc`* +### **Groups being compared:** +#### *`r contrasts`* +### **Peak sources:** +#### *`r peakcaller`* +### **Report generated:** +#### *`r dateandtime`* + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = TRUE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +``` + +
+ +## Read in sample sheet information and peak information +```{r samples, echo=FALSE, warning=FALSE,message=FALSE} +samples <- dba(sampleSheet = csvfile) +consensus <- dba.peakset(samples, consensus = DBA_CONDITION) +print(samples) +``` + +
+ +## Plot raw information about the peaks +### Correlation heatmap: Only peaks +```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(plot(samples, main = ""), silent = TRUE) +``` + +### PCA: Only peaks +```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"} +try(dba.plotPCA(samples, DBA_CONDITION), silent = TRUE) +``` + +### Overlapping peak counts +```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples, 1:nrow(samples$samples)) +} else { + dba.plotVenn(samples, samples$masks[[3]]) + dba.plotVenn(samples, samples$masks[[4]]) + dba.plotVenn(consensus, consensus$masks$Consensus) +} +``` + +```{r peaksORsummits, echo=F} +if (grepl("narrow", samples$samples$Peaks[1])) { + summits <- TRUE + print("Narrow peak calling tool.") + print("Differential peaks are 250bp upstream and downstream of the summits.") +} else if (grepl("broad", samples$samples$Peaks[1])) { + summits <- FALSE + print("Broad peak calling tool.") + print("Differential peaks are consensus peaks.") +} else { + summits <- FALSE + print("Indeterminate peak calling tool.") + print("Differential peaks are consensus peaks.") +} +``` + +## Read in bam file information under all peaks found in at least two samples +```{r DBcount, echo=FALSE, warning=FALSE,message=FALSE} +if (summits == TRUE) { + DBdataCounts <- dba.count(samples, summits = 250) +} else { + DBdataCounts <- dba.count(samples) +} +print(DBdataCounts) +``` + +
+ +## Plot raw information about all analyzed peaks +### Correlation heatmap: Peaks and reads +```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(plot(DBdataCounts, main = ""), silent = TRUE) +``` + +### Heatmap: Average signal across each peak +```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(dba.plotHeatmap(DBdataCounts, correlations = FALSE), silent = TRUE) +``` + +### PCA: Peaks and reads +```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} +try(dba.plotPCA(DBdataCounts, DBA_CONDITION), silent = TRUE) +``` + +## Associate individual samples with the different contrasts +```{r contrast, echo=FALSE, warning=FALSE,message=FALSE} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers = 2, categories = DBA_CONDITION) +print(DBdatacontrast) +``` + +
+ +## Call differential peaks using Deseq2 and EdgeR +```{r analyze, echo=FALSE, warning=FALSE,message=FALSE} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) +``` + +```{r, echo=FALSE, warning=FALSE,message=FALSE} +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) +DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) +``` + +### PCA: DeSeq2 +```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} +try(dba.plotPCA(DBAnalysisDeseq2, contrast = 1, method = DBA_DESEQ2), silent = TRUE) +``` + +### PCA: EdgeR +```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} +try(dba.plotPCA(DBAnalysisEdgeR, contrast = 1, method = DBA_EDGER), silent = TRUE) +``` + +### MANorm: (left) Deseq2, (right) EdgeR +```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} +par(mfcol = c(1, 2)) +try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) +try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) +``` + +### Volcano plot: DeSeq2 +```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) +``` + +### Volcano plot: EdgeR +```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) +``` + +### Boxplots: (left) Deseq2, (right) EdgeR +```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} +par(mfcol = c(1, 2)) +if (length(DBReportDeseq2) > 0) { + try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) +} else { + plot(0, type = "n", axes = FALSE, ann = FALSE) +} +try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) +``` + +## Differentially bound peaks: Deseq2 output +```{r Deseq2Report, echo=FALSE, warning=FALSE,message=FALSE} +outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt") +write.table(DBReportDeseq2, outfile, quote = F, sep = "\t", row.names = F) +DT::datatable(data.frame(DBReportDeseq2), rownames = F) +``` + +## Differentially bound peaks: EdgeR output +```{r EdgeRReport, echo=FALSE, warning=FALSE,message=FALSE} +outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR.txt") +write.table(DBReportEdgeR, outfile, quote = F, sep = "\t", row.names = F) +DT::datatable(data.frame(DBReportEdgeR), rownames = F) +``` + +## R tool version information +```{r Info, echo=FALSE, message=FALSE, warning=FALSE} +sessionInfo() +``` From ecf0116a83ae3d32bfaf4e4df61a9381007d4d7c Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 4 Dec 2023 12:40:00 -0500 Subject: [PATCH 02/35] test: add columns for diff groups --- assets/samplesheet_test.csv | 14 +++++++------- assets/samplesheet_test_mm10.csv | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv index bf590be8..4eb97a99 100644 --- a/assets/samplesheet_test.csv +++ b/assets/samplesheet_test.csv @@ -1,7 +1,7 @@ -sample,rep,fastq_1,fastq_2,antibody,control -SPT5_T0,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_2.fastq.gz,SPT5,SPT5_INPUT_1 -SPT5_T0,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822154_1.fastq.gz,,SPT5,SPT5_INPUT_2 -SPT5_T15,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_2.fastq.gz,SPT5,SPT5_INPUT_1 -SPT5_T15,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822158_1.fastq.gz,,SPT5,SPT5_INPUT_2 -SPT5_INPUT_1,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,, -SPT5_INPUT_2,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,, +sample,rep,fastq_1,fastq_2,antibody,control,condition +SPT5_T0,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_2.fastq.gz,SPT5,SPT5_INPUT_1,treatment, +SPT5_T0,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822154_1.fastq.gz,,SPT5,SPT5_INPUT_2,treatment, +SPT5_T15,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_2.fastq.gz,SPT5,SPT5_INPUT_1,control +SPT5_T15,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822158_1.fastq.gz,,SPT5,SPT5_INPUT_2,control +SPT5_INPUT_1,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,,, +SPT5_INPUT_2,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,,, diff --git a/assets/samplesheet_test_mm10.csv b/assets/samplesheet_test_mm10.csv index 1f215ecb..ed410808 100644 --- a/assets/samplesheet_test_mm10.csv +++ b/assets/samplesheet_test_mm10.csv @@ -1,4 +1,4 @@ -sample,rep,fastq_1,fastq_2,antibody,control -CTCF_ChIP_macrophage_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081748_1.fastq.gz,,CTCF,WCE_p20 -CTCF_ChIP_MEF_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081752_1.fastq.gz,,CTCF,WCE_p20 -WCE_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081773_1.fastq.gz,,, +sample,rep,fastq_1,fastq_2,antibody,control,diagnosis +CTCF_ChIP_macrophage_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081748_1.fastq.gz,,CTCF,WCE_p20,cancer +CTCF_ChIP_MEF_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081752_1.fastq.gz,,CTCF,WCE_p20,normal +WCE_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081773_1.fastq.gz,,,, From fe0010f5459f28d719c9f96ba2a03f7c6455adb3 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 4 Dec 2023 19:27:26 -0500 Subject: [PATCH 03/35] feat: create module to check contrasts & write modified samplesheet --- assets/contrasts_full_mm10.yml | 12 +++++ assets/contrasts_test.yml | 5 ++ assets/contrasts_test_mm10.yml | 5 ++ assets/samplesheet_test.csv | 14 ++--- assets/samplesheet_test_mm10.csv | 8 +-- modules/local/check_contrasts/main.nf | 17 ++++++ .../templates/check_contrasts.R | 53 +++++++++++++++++++ modules/local/diffbind/main.nf | 0 modules/local/samplesheet_check.nf | 1 + nextflow.config | 1 + 10 files changed, 105 insertions(+), 11 deletions(-) create mode 100644 assets/contrasts_full_mm10.yml create mode 100644 assets/contrasts_test.yml create mode 100644 assets/contrasts_test_mm10.yml create mode 100644 modules/local/check_contrasts/main.nf create mode 100644 modules/local/check_contrasts/templates/check_contrasts.R create mode 100644 modules/local/diffbind/main.nf diff --git a/assets/contrasts_full_mm10.yml b/assets/contrasts_full_mm10.yml new file mode 100644 index 00000000..761a435f --- /dev/null +++ b/assets/contrasts_full_mm10.yml @@ -0,0 +1,12 @@ +antibody: + p20: + - CTCF_ChIP_macrophage_p20 + - CTCF_ChIP_MEF_p20 + p3: + - CTCF_ChIP_macrophage_p3 +celltype: + macrophage: + - CTCF_ChIP_macrophage_p20 + - CTCF_ChIP_macrophage_p3 + cancer: + - CTCF_ChIP_MEF_p20 diff --git a/assets/contrasts_test.yml b/assets/contrasts_test.yml new file mode 100644 index 00000000..dbb7f7bb --- /dev/null +++ b/assets/contrasts_test.yml @@ -0,0 +1,5 @@ +timepoint: + T0: + - SPT5_T0 + T15: + - SPT5_T15 diff --git a/assets/contrasts_test_mm10.yml b/assets/contrasts_test_mm10.yml new file mode 100644 index 00000000..bfc81f58 --- /dev/null +++ b/assets/contrasts_test_mm10.yml @@ -0,0 +1,5 @@ +celltype: + macrophage: + - CTCF_ChIP_macrophage_p20 + MEF: + - CTCF_ChIP_MEF_p20 diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv index 4eb97a99..bf590be8 100644 --- a/assets/samplesheet_test.csv +++ b/assets/samplesheet_test.csv @@ -1,7 +1,7 @@ -sample,rep,fastq_1,fastq_2,antibody,control,condition -SPT5_T0,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_2.fastq.gz,SPT5,SPT5_INPUT_1,treatment, -SPT5_T0,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822154_1.fastq.gz,,SPT5,SPT5_INPUT_2,treatment, -SPT5_T15,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_2.fastq.gz,SPT5,SPT5_INPUT_1,control -SPT5_T15,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822158_1.fastq.gz,,SPT5,SPT5_INPUT_2,control -SPT5_INPUT_1,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,,, -SPT5_INPUT_2,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,,, +sample,rep,fastq_1,fastq_2,antibody,control +SPT5_T0,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_2.fastq.gz,SPT5,SPT5_INPUT_1 +SPT5_T0,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822154_1.fastq.gz,,SPT5,SPT5_INPUT_2 +SPT5_T15,1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_2.fastq.gz,SPT5,SPT5_INPUT_1 +SPT5_T15,2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822158_1.fastq.gz,,SPT5,SPT5_INPUT_2 +SPT5_INPUT_1,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,, +SPT5_INPUT_2,,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,, diff --git a/assets/samplesheet_test_mm10.csv b/assets/samplesheet_test_mm10.csv index ed410808..1f215ecb 100644 --- a/assets/samplesheet_test_mm10.csv +++ b/assets/samplesheet_test_mm10.csv @@ -1,4 +1,4 @@ -sample,rep,fastq_1,fastq_2,antibody,control,diagnosis -CTCF_ChIP_macrophage_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081748_1.fastq.gz,,CTCF,WCE_p20,cancer -CTCF_ChIP_MEF_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081752_1.fastq.gz,,CTCF,WCE_p20,normal -WCE_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081773_1.fastq.gz,,,, +sample,rep,fastq_1,fastq_2,antibody,control +CTCF_ChIP_macrophage_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081748_1.fastq.gz,,CTCF,WCE_p20 +CTCF_ChIP_MEF_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081752_1.fastq.gz,,CTCF,WCE_p20 +WCE_p20,,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081773_1.fastq.gz,,, diff --git a/modules/local/check_contrasts/main.nf b/modules/local/check_contrasts/main.nf new file mode 100644 index 00000000..ed6792b2 --- /dev/null +++ b/modules/local/check_contrasts/main.nf @@ -0,0 +1,17 @@ +process CHECK_CONTRASTS { + tag { contrasts } + label 'process_single' + + container 'nciccbr/consensus_peaks:v1.1' + + input: + path(samplesheet), path(contrasts) + + output: + path("*.csv"), emit: csv + path("versions.yml"), emit: versions + + script: + output_filename = "contrasts.valid.csv" + template 'check_contrasts.R' +} diff --git a/modules/local/check_contrasts/templates/check_contrasts.R b/modules/local/check_contrasts/templates/check_contrasts.R new file mode 100644 index 00000000..2e304394 --- /dev/null +++ b/modules/local/check_contrasts/templates/check_contrasts.R @@ -0,0 +1,53 @@ +#!/usr/bin/env Rscript +library(assertthat) +library(dplyr) +library(glue) + +main <- function(contrasts_filename = "${contrasts}", + samplesheet_filename = "${samplesheet}", + output_basename = "${output_basename}", + versions_filename = "versions.yml") { + write_version(versions_filename) + contrasts_lst <- yaml::read_yaml(contrasts_filename) + samples_df <- readr::read_csv(samplesheet_filename) + + assert_that( + all(unlist(contrasts_lst) %in% (samples_df %>% dplyr::pull(sample))), + glue("All sample names in contrasts must also be in the sample sheet") + ) + names(contrasts_lst) %>% lapply(check_contrast, contrasts_lst) + names(contrasts_lst) %>% lapply( + write_contrast_samplesheet, + contrasts_lst, samples_df, output_basename + ) +} + +check_contrast <- function(contrast_name, contrasts_lst) { + contrast_len <- length(contrasts_lst[[contrast_name]]) + assert_that( + contrast_len == 2, + glue("Contrasts must have only two groups per comparison, but {contrast_name} has {contrast_len}") + ) +} + +write_contrast_samplesheet <- function(contrast_name, contrasts_lst, samples_df, output_basename) { + contrast_df <- contrasts_lst[[contrast_name]] %>% + tibble::as_tibble() %>% + tidyr::pivot_longer( + cols = everything(), + names_to = "group", values_to = "sample" + ) + samples_df %>% + dplyr::inner_join(contrast_df) %>% + readr::write_csv(glue("{output_basename}.{contrast_name}.csv")) +} + +get_version <- function() { + return(paste0(R.version[["major"]], ".", R.version[["minor"]])) +} + +write_version <- function(version_file) { + write_lines(get_version(), version_file) +} + +main() diff --git a/modules/local/diffbind/main.nf b/modules/local/diffbind/main.nf new file mode 100644 index 00000000..e69de29b diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index dd421b84..c672d27a 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -1,6 +1,7 @@ // source: https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/modules/local/samplesheet_check.nf process SAMPLESHEET_CHECK { tag "$samplesheet" + label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : diff --git a/nextflow.config b/nextflow.config index f4c2e12a..ca6d9a2e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,6 +2,7 @@ nextflow.enable.dsl = 2 params { input = null + contrasts = null seq_center = null read_length = null genome = null From 88e2412020eefabd05edbc729bfc22130b46e30f Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 11:35:52 -0500 Subject: [PATCH 04/35] fix: call check_contrasts in main.nf --- assets/contrasts_test.yml | 5 ++++ conf/full_mm10.config | 1 + conf/test.config | 1 + conf/test_mm10.config | 1 + main.nf | 26 +++++++++++++++---- modules/local/check_contrasts/main.nf | 5 ++-- .../templates/check_contrasts.R | 25 +++++++++++++----- 7 files changed, 50 insertions(+), 14 deletions(-) diff --git a/assets/contrasts_test.yml b/assets/contrasts_test.yml index dbb7f7bb..059fa4a2 100644 --- a/assets/contrasts_test.yml +++ b/assets/contrasts_test.yml @@ -3,3 +3,8 @@ timepoint: - SPT5_T0 T15: - SPT5_T15 +condition: + control: + - SPT5_T0 + treatment: + - SPT5_T15 diff --git a/conf/full_mm10.config b/conf/full_mm10.config index b069408d..730a185b 100644 --- a/conf/full_mm10.config +++ b/conf/full_mm10.config @@ -5,6 +5,7 @@ params { genome = 'mm10' outdir = "results/full_mm10" input = "assets/samplesheet_full_mm10.csv" + contrasts = 'assets/contrasts_full_mm10.yml' sicer { species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py } diff --git a/conf/test.config b/conf/test.config index c023360a..ddd6831a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -4,6 +4,7 @@ params { outdir = 'results/test' input = 'assets/samplesheet_test.csv' // adapted from 'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.0/samplesheet_test.csv' + contrasts = 'assets/contrasts_test.yml' genome = 'sacCer3' read_length = 50 diff --git a/conf/test_mm10.config b/conf/test_mm10.config index 12277157..44160c46 100644 --- a/conf/test_mm10.config +++ b/conf/test_mm10.config @@ -5,6 +5,7 @@ params { genome = 'mm10' outdir = "results/test_mm10" input = "assets/samplesheet_test_mm10.csv" + contrats = 'assets/contrasts_test_mm10.yml' read_length = 50 sicer.species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py diff --git a/main.nf b/main.nf index eb236101..05ad73ef 100644 --- a/main.nf +++ b/main.nf @@ -18,7 +18,6 @@ log.info """\ .stripIndent() // SUBWORKFLOWS - include { INPUT_CHECK } from './subworkflows/local/input_check.nf' include { PREPARE_GENOME } from './subworkflows/local/prepare_genome.nf' include { FILTER_BLACKLIST } from './subworkflows/CCBR/filter_blacklist/' @@ -31,10 +30,10 @@ include { ANNOTATE } from './subworkflows/local/annotate.nf' // MODULES include { CUTADAPT } from "./modules/CCBR/cutadapt" -include { PHANTOM_PEAKS } from "./modules/local/qc.nf" -include { PPQT_PROCESS +include { PHANTOM_PEAKS + PPQT_PROCESS MULTIQC } from "./modules/local/qc.nf" -include { NORMALIZE_INPUT } from "./modules/local/deeptools.nf" +include { CHECK_CONTRASTS } from "./modules/local/check_contrasts/" workflow.onComplete { if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) { @@ -55,7 +54,12 @@ workflow { } workflow CHIPSEQ { - INPUT_CHECK(file(params.input), params.seq_center) + sample_sheet = file(params.input, checkIfExists: true) + INPUT_CHECK(sample_sheet, params.seq_center) + if (params.contrasts) { + CHECK_CONTRASTS(sample_sheet, file(params.contrasts, checkIfExists: true)) + } + INPUT_CHECK.out.reads.set { raw_fastqs } raw_fastqs | CUTADAPT CUTADAPT.out.reads.set{ trimmed_fastqs } @@ -110,6 +114,18 @@ workflow CHIPSEQ { PREPARE_GENOME.out.bioc_annot) ch_multiqc = ch_multiqc.mix(CALL_PEAKS.out.plots, ANNOTATE.out.plots) + // retrieve sample basename and peak-calling tool from metadata + CONSENSUS_PEAKS.out.peaks + .map{ meta, bed -> + meta_split = meta.id.tokenize('.') + assert meta_split.size() == 2 + [ [ id: meta_split[0], tool: meta_split[1] ], bed ] + } + .set{ ch_consensus_peaks } + if (params.contrasts) { + CHECK_CONTRASTS.out.csv + .flatten() | view + } } MULTIQC( diff --git a/modules/local/check_contrasts/main.nf b/modules/local/check_contrasts/main.nf index ed6792b2..fec53df3 100644 --- a/modules/local/check_contrasts/main.nf +++ b/modules/local/check_contrasts/main.nf @@ -5,13 +5,14 @@ process CHECK_CONTRASTS { container 'nciccbr/consensus_peaks:v1.1' input: - path(samplesheet), path(contrasts) + path(samplesheet) + path(contrasts) output: path("*.csv"), emit: csv path("versions.yml"), emit: versions script: - output_filename = "contrasts.valid.csv" + output_basename = "sample_contrasts" template 'check_contrasts.R' } diff --git a/modules/local/check_contrasts/templates/check_contrasts.R b/modules/local/check_contrasts/templates/check_contrasts.R index 2e304394..dacb2664 100644 --- a/modules/local/check_contrasts/templates/check_contrasts.R +++ b/modules/local/check_contrasts/templates/check_contrasts.R @@ -6,14 +6,15 @@ library(glue) main <- function(contrasts_filename = "${contrasts}", samplesheet_filename = "${samplesheet}", output_basename = "${output_basename}", - versions_filename = "versions.yml") { - write_version(versions_filename) + versions_filename = "versions.yml", + process_name = "${task.process}") { + write_version(versions_filename, process_name = process_name) contrasts_lst <- yaml::read_yaml(contrasts_filename) samples_df <- readr::read_csv(samplesheet_filename) assert_that( all(unlist(contrasts_lst) %in% (samples_df %>% dplyr::pull(sample))), - glue("All sample names in contrasts must also be in the sample sheet") + msg = glue("All sample names in contrasts must also be in the sample sheet") ) names(contrasts_lst) %>% lapply(check_contrast, contrasts_lst) names(contrasts_lst) %>% lapply( @@ -22,14 +23,16 @@ main <- function(contrasts_filename = "${contrasts}", ) } +#' Ensure contrast has exactly two groups to compare check_contrast <- function(contrast_name, contrasts_lst) { contrast_len <- length(contrasts_lst[[contrast_name]]) assert_that( contrast_len == 2, - glue("Contrasts must have only two groups per comparison, but {contrast_name} has {contrast_len}") + msg = glue("Contrasts must have only two groups per comparison, but {contrast_name} has {contrast_len}") ) } +#' Combine sample sheet with contrast group write_contrast_samplesheet <- function(contrast_name, contrasts_lst, samples_df, output_basename) { contrast_df <- contrasts_lst[[contrast_name]] %>% tibble::as_tibble() %>% @@ -42,12 +45,20 @@ write_contrast_samplesheet <- function(contrast_name, contrasts_lst, samples_df, readr::write_csv(glue("{output_basename}.{contrast_name}.csv")) } -get_version <- function() { +#' Get R version as a semantic versioning string +get_r_version <- function() { return(paste0(R.version[["major"]], ".", R.version[["minor"]])) } -write_version <- function(version_file) { - write_lines(get_version(), version_file) +#' Write R version to a yaml file +write_version <- function(version_file, process_name = "${task.process}") { + readr::write_lines( + glue('"{process_name}":', + "R: {get_r_version()}", + .sep = "\\n\\t" + ), + version_file + ) } main() From fa61c9d9c34762833889177e6ace438b052d5c5c Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 12:32:32 -0500 Subject: [PATCH 05/35] fix: use validated csv, include contrast in output --- .../local/check_contrasts/templates/check_contrasts.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/local/check_contrasts/templates/check_contrasts.R b/modules/local/check_contrasts/templates/check_contrasts.R index dacb2664..b75a09e5 100644 --- a/modules/local/check_contrasts/templates/check_contrasts.R +++ b/modules/local/check_contrasts/templates/check_contrasts.R @@ -13,7 +13,7 @@ main <- function(contrasts_filename = "${contrasts}", samples_df <- readr::read_csv(samplesheet_filename) assert_that( - all(unlist(contrasts_lst) %in% (samples_df %>% dplyr::pull(sample))), + all(unlist(contrasts_lst) %in% (samples_df %>% dplyr::pull(sample_basename))), msg = glue("All sample names in contrasts must also be in the sample sheet") ) names(contrasts_lst) %>% lapply(check_contrast, contrasts_lst) @@ -38,10 +38,12 @@ write_contrast_samplesheet <- function(contrast_name, contrasts_lst, samples_df, tibble::as_tibble() %>% tidyr::pivot_longer( cols = everything(), - names_to = "group", values_to = "sample" - ) + names_to = "group", values_to = "sample_basename" + ) %>% + mutate(contrast = contrast_name) + print(head(contrast_df)) samples_df %>% - dplyr::inner_join(contrast_df) %>% + dplyr::inner_join(contrast_df, by = "sample_basename") %>% readr::write_csv(glue("{output_basename}.{contrast_name}.csv")) } From aa2327de22faaaccc37fec864933aaeaa17ec160 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 12:33:11 -0500 Subject: [PATCH 06/35] feat: emit validated csv --- subworkflows/local/input_check.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index d338003b..36602633 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -11,14 +11,15 @@ workflow INPUT_CHECK { seq_center // string: sequencing center for read group main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv + valid_csv = SAMPLESHEET_CHECK( samplesheet ).csv + valid_csv .splitCsv ( header:true, sep:',' ) .map { create_fastq_channel(it, seq_center) } .set { reads } emit: reads // channel: [ val(meta), [ reads ] ] + csv = valid_csv versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } From aa117061e75b19b2934068b427f3efba22c5dbef Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 12:46:15 -0500 Subject: [PATCH 07/35] feat: create diff subworkflow combine peaks with contrast meta --- main.nf | 15 ++++------ subworkflows/local/diff/main.nf | 49 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 subworkflows/local/diff/main.nf diff --git a/main.nf b/main.nf index 05ad73ef..0dbda96e 100644 --- a/main.nf +++ b/main.nf @@ -27,13 +27,13 @@ include { QC } from './subworkflows/local/qc.nf' include { CALL_PEAKS } from './subworkflows/local/peaks.nf' include { CONSENSUS_PEAKS } from './subworkflows/CCBR/consensus_peaks/' include { ANNOTATE } from './subworkflows/local/annotate.nf' +include { DIFF } from './subworkflows/local/diff/' // MODULES include { CUTADAPT } from "./modules/CCBR/cutadapt" include { PHANTOM_PEAKS PPQT_PROCESS MULTIQC } from "./modules/local/qc.nf" -include { CHECK_CONTRASTS } from "./modules/local/check_contrasts/" workflow.onComplete { if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) { @@ -54,11 +54,7 @@ workflow { } workflow CHIPSEQ { - sample_sheet = file(params.input, checkIfExists: true) - INPUT_CHECK(sample_sheet, params.seq_center) - if (params.contrasts) { - CHECK_CONTRASTS(sample_sheet, file(params.contrasts, checkIfExists: true)) - } + INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center) INPUT_CHECK.out.reads.set { raw_fastqs } raw_fastqs | CUTADAPT @@ -119,12 +115,13 @@ workflow CHIPSEQ { .map{ meta, bed -> meta_split = meta.id.tokenize('.') assert meta_split.size() == 2 - [ [ id: meta_split[0], tool: meta_split[1] ], bed ] + [ [ sample_basename: meta_split[0], tool: meta_split[1] ], bed ] } .set{ ch_consensus_peaks } if (params.contrasts) { - CHECK_CONTRASTS.out.csv - .flatten() | view + contrasts = file(params.contrasts, checkIfExists: true) + DIFF( ch_consensus_peaks, INPUT_CHECK.out.csv, contrasts ) + } } diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf new file mode 100644 index 00000000..e22f4b76 --- /dev/null +++ b/subworkflows/local/diff/main.nf @@ -0,0 +1,49 @@ +include { CHECK_CONTRASTS } from "../../../modules/local/check_contrasts/" + +workflow DIFF { + take: + peaks + samplesheet_file + contrasts_file + + main: + + CHECK_CONTRASTS(samplesheet_file, contrasts_file) + .csv + .flatten() + .splitCsv( header: true, sep: ',' ) + .map{ it -> + meta = get_contrast_meta(it) + [ meta.sample_basename, [group: meta.group, contrast: meta.contrast] ] + } + .set{ contrasts } + peaks.map{ meta, bed -> + [ meta.sample_basename, meta, bed] + } + .cross( contrasts ) + .map{ it.flatten() } + .map{ sample_basename1, peak_meta, bed, sample_basename2, con_meta -> + assert sample_basename1 == sample_basename2 + meta = peak_meta + con_meta + [ meta, bed ] + } + .set{ ch_peaks_contrasts } + + emit: + diff_peaks = peaks + +} + +def get_contrast_meta(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.sample_basename = row.sample_basename + meta.rep = row.rep + meta.single_end = row.single_end.toBoolean() + meta.antibody = row.antibody + meta.control = row.control + meta.group = row.group + meta.contrast = row.contrast + + return meta +} From d53ac8d3964a79f5a1c19406633c292b292c752d Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 16:17:59 -0500 Subject: [PATCH 08/35] fix: also need bam files for diffbind --- main.nf | 3 ++- subworkflows/local/peaks.nf | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 0dbda96e..3025cfb0 100644 --- a/main.nf +++ b/main.nf @@ -120,7 +120,8 @@ workflow CHIPSEQ { .set{ ch_consensus_peaks } if (params.contrasts) { contrasts = file(params.contrasts, checkIfExists: true) - DIFF( ch_consensus_peaks, INPUT_CHECK.out.csv, contrasts ) + // TODO use consensus peaks for regions of interest in diffbind + DIFF( CALL_PEAKS.out.bam_peaks, INPUT_CHECK.out.csv, contrasts ) } } diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf index b5d71062..20ad040b 100644 --- a/subworkflows/local/peaks.nf +++ b/subworkflows/local/peaks.nf @@ -104,9 +104,8 @@ workflow CALL_PEAKS { .map{ meta1, bam, bai, meta2, peak, tool -> meta1 == meta2 ? [ meta1, bam, bai, peak, tool ] : null } - .combine(chrom_sizes) .set{ ch_bam_peaks } - ch_bam_peaks | FRACTION_IN_PEAKS + ch_bam_peaks.combine(chrom_sizes) | FRACTION_IN_PEAKS FRACTION_IN_PEAKS.out.collect() | CONCAT_FRIPS | PLOT_FRIP ch_peaks @@ -117,7 +116,7 @@ workflow CALL_PEAKS { .combine(chrom_sizes) | JACCARD_INDEX JACCARD_INDEX.out.collect() | CONCAT_JACCARD | PLOT_JACCARD - ch_bam_peaks | GET_PEAK_META + ch_bam_peaks.combine(chrom_sizes) | GET_PEAK_META GET_PEAK_META.out.collect() | CONCAT_PEAK_META | PLOT_PEAK_WIDTHS ch_plots = PLOT_FRIP.out @@ -126,5 +125,6 @@ workflow CALL_PEAKS { emit: peaks = ch_peaks + bam_peaks = ch_bam_peaks plots = ch_plots } From 6332e0be0137879092576ceeea43c5c8e6599554 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 16:20:09 -0500 Subject: [PATCH 09/35] feat: create module to prepare sample sheet for diffbind --- modules/local/diffbind/main.nf | 0 modules/local/diffbind/notebook/main.nf | 4 ++++ modules/local/diffbind/prep/main.nf | 27 ++++++++++++++++++++++ subworkflows/local/diff/main.nf | 30 ++++++++++++++++--------- 4 files changed, 50 insertions(+), 11 deletions(-) delete mode 100644 modules/local/diffbind/main.nf create mode 100644 modules/local/diffbind/notebook/main.nf create mode 100644 modules/local/diffbind/prep/main.nf diff --git a/modules/local/diffbind/main.nf b/modules/local/diffbind/main.nf deleted file mode 100644 index e69de29b..00000000 diff --git a/modules/local/diffbind/notebook/main.nf b/modules/local/diffbind/notebook/main.nf new file mode 100644 index 00000000..880ee5f0 --- /dev/null +++ b/modules/local/diffbind/notebook/main.nf @@ -0,0 +1,4 @@ +process DIFFBIND { + tag { meta.id } + label 'process_single' +} diff --git a/modules/local/diffbind/prep/main.nf b/modules/local/diffbind/prep/main.nf new file mode 100644 index 00000000..e8887a93 --- /dev/null +++ b/modules/local/diffbind/prep/main.nf @@ -0,0 +1,27 @@ +process PREP_DIFFBIND { + tag { "${metas[0].contrast}.${metas[0].tool}" } + label 'process_single' + + input: + tuple val(metas), path(bams), path(bais), path(peaks) + + output: + path("*.csv") + + script: + def csv_text = [['sampleID', "replicate", 'condition', 'bam', 'bai', 'peak']] + [metas, bams, bais, peaks] + .transpose() + .each { meta, bam, bai, peak -> + csv_text.add([meta.id, meta.rep, meta.group, bam, bai, peak]) + } + csv_text = csv_text*.join(',').join(System.lineSeparator) + """ + echo -ne "${csv_text}" > ${metas[0].contrast}.${metas[0].tool}.csv + """ + + stub: + """ + touch ${metas[0].contrast}.${metas[0].tool}.csv + """ +} diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index e22f4b76..3908fb18 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -1,8 +1,9 @@ include { CHECK_CONTRASTS } from "../../../modules/local/check_contrasts/" +include { PREP_DIFFBIND } from "../../../modules/local/diffbind/prep/" workflow DIFF { take: - peaks + bam_peaks samplesheet_file contrasts_file @@ -17,20 +18,27 @@ workflow DIFF { [ meta.sample_basename, [group: meta.group, contrast: meta.contrast] ] } .set{ contrasts } - peaks.map{ meta, bed -> - [ meta.sample_basename, meta, bed] + bam_peaks.map{ meta, bam, bai, peak, tool -> + [ meta.sample_basename, meta + [tool: tool], bam, bai, peak ] } - .cross( contrasts ) - .map{ it.flatten() } - .map{ sample_basename1, peak_meta, bed, sample_basename2, con_meta -> - assert sample_basename1 == sample_basename2 - meta = peak_meta + con_meta - [ meta, bed ] + .combine( contrasts ) + .map{ sample_basename1, peak_meta, bam, bai, peak, sample_basename2, con_meta -> + sample_basename1 == sample_basename2 ? [ peak_meta + con_meta, bam, bai, peak ] : null } + .unique() .set{ ch_peaks_contrasts } - + ch_peaks_contrasts + .map{ meta, bam, bai, peak -> + [ "${meta.contrast}.${meta.tool}", meta, bam, bai, peak ] + } + .groupTuple() + .map{ it -> // drop meta.contrast + it[1..-1] + } + .view() + | PREP_DIFFBIND emit: - diff_peaks = peaks + diff_peaks = bam_peaks } From dbd32c5fc33ec110f62661aec139a82a8849d35e Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 16:29:03 -0500 Subject: [PATCH 10/35] feat: enforce contrast & tool must be the same --- modules/local/diffbind/prep/main.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/local/diffbind/prep/main.nf b/modules/local/diffbind/prep/main.nf index e8887a93..cded57b9 100644 --- a/modules/local/diffbind/prep/main.nf +++ b/modules/local/diffbind/prep/main.nf @@ -6,9 +6,12 @@ process PREP_DIFFBIND { tuple val(metas), path(bams), path(bais), path(peaks) output: - path("*.csv") + tuple val(new_meta), path("*.csv"), emit: csv script: + assert metas*.contrast.toSet().size() == 1 + assert metas*.tool.toSet().size() == 1 + new_meta = [contrast: metas[0].contrast, tool: metas[0].tool] def csv_text = [['sampleID', "replicate", 'condition', 'bam', 'bai', 'peak']] [metas, bams, bais, peaks] .transpose() From f09db390b535ee4c90416623bc906061dbf5b07b Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 17:06:11 -0500 Subject: [PATCH 11/35] fix: include ctrl bam/bai for diffbind [wip] --- main.nf | 7 ++++++- modules/local/diffbind/prep/main.nf | 6 +++--- subworkflows/local/diff/main.nf | 19 +++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/main.nf b/main.nf index 3025cfb0..7e0f7ebd 100644 --- a/main.nf +++ b/main.nf @@ -121,7 +121,12 @@ workflow CHIPSEQ { if (params.contrasts) { contrasts = file(params.contrasts, checkIfExists: true) // TODO use consensus peaks for regions of interest in diffbind - DIFF( CALL_PEAKS.out.bam_peaks, INPUT_CHECK.out.csv, contrasts ) + bam_peaks = CALL_PEAKS.out.bam_peaks + .combine(deduped_bam) + .map{meta1, bam1, bai1, peak, tool, meta2, bam2, bai2 -> + meta1.control == meta2.id ? [ meta1.sample_basename, meta1 + [tool: tool], bam1, bai1, peak, bam2, bai2 ] : null + } + DIFF( bam_peaks, INPUT_CHECK.out.csv, contrasts ) } } diff --git a/modules/local/diffbind/prep/main.nf b/modules/local/diffbind/prep/main.nf index cded57b9..78170af5 100644 --- a/modules/local/diffbind/prep/main.nf +++ b/modules/local/diffbind/prep/main.nf @@ -3,7 +3,7 @@ process PREP_DIFFBIND { label 'process_single' input: - tuple val(metas), path(bams), path(bais), path(peaks) + tuple val(metas), path(bams), path(bais), path(peaks), path(ctrl_bams), path(ctrl_bais) output: tuple val(new_meta), path("*.csv"), emit: csv @@ -12,11 +12,11 @@ process PREP_DIFFBIND { assert metas*.contrast.toSet().size() == 1 assert metas*.tool.toSet().size() == 1 new_meta = [contrast: metas[0].contrast, tool: metas[0].tool] - def csv_text = [['sampleID', "replicate", 'condition', 'bam', 'bai', 'peak']] + def csv_text = [['SampleID', "Replicate", 'Condition', 'bamReads', "ControlID", "bamControl", 'Peaks', 'PeakCaller']] [metas, bams, bais, peaks] .transpose() .each { meta, bam, bai, peak -> - csv_text.add([meta.id, meta.rep, meta.group, bam, bai, peak]) + csv_text.add([meta.id, meta.rep, meta.group, bam, meta.control, ctrlbam, peak, meta.tool]) } csv_text = csv_text*.join(',').join(System.lineSeparator) """ diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index 3908fb18..c0817a8d 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -17,26 +17,21 @@ workflow DIFF { meta = get_contrast_meta(it) [ meta.sample_basename, [group: meta.group, contrast: meta.contrast] ] } + .unique() .set{ contrasts } - bam_peaks.map{ meta, bam, bai, peak, tool -> - [ meta.sample_basename, meta + [tool: tool], bam, bai, peak ] - } + bam_peaks .combine( contrasts ) - .map{ sample_basename1, peak_meta, bam, bai, peak, sample_basename2, con_meta -> - sample_basename1 == sample_basename2 ? [ peak_meta + con_meta, bam, bai, peak ] : null + .map{ sample_basename1, peak_meta, bam, bai, peak, ctrl_bam, ctrl_bai, sample_basename2, con_meta -> + sample_basename1 == sample_basename2 ? [ "${meta.contrast}.${meta.tool}", peak_meta + con_meta, bam, bai, peak, ctrl_bam, ctrl_bai ] : null } .unique() - .set{ ch_peaks_contrasts } - ch_peaks_contrasts - .map{ meta, bam, bai, peak -> - [ "${meta.contrast}.${meta.tool}", meta, bam, bai, peak ] - } .groupTuple() .map{ it -> // drop meta.contrast it[1..-1] } - .view() - | PREP_DIFFBIND + .set{ ch_peaks_contrasts } + ch_peaks_contrasts | view + ch_peaks_contrasts | PREP_DIFFBIND emit: diff_peaks = bam_peaks From bb9c77b05a4cda49a976f4ff43af3ebad41e19c5 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 17:39:46 -0500 Subject: [PATCH 12/35] fix: use collectFile to avoid filename collisions in prep_diffbind --- main.nf | 3 ++- .../templates/check_contrasts.R | 4 ++- modules/local/diffbind/prep/main.nf | 26 ++++++++----------- subworkflows/local/diff/main.nf | 11 ++++---- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/main.nf b/main.nf index 7e0f7ebd..99922987 100644 --- a/main.nf +++ b/main.nf @@ -121,11 +121,12 @@ workflow CHIPSEQ { if (params.contrasts) { contrasts = file(params.contrasts, checkIfExists: true) // TODO use consensus peaks for regions of interest in diffbind - bam_peaks = CALL_PEAKS.out.bam_peaks + CALL_PEAKS.out.bam_peaks .combine(deduped_bam) .map{meta1, bam1, bai1, peak, tool, meta2, bam2, bai2 -> meta1.control == meta2.id ? [ meta1.sample_basename, meta1 + [tool: tool], bam1, bai1, peak, bam2, bai2 ] : null } + .set{bam_peaks} DIFF( bam_peaks, INPUT_CHECK.out.csv, contrasts ) } diff --git a/modules/local/check_contrasts/templates/check_contrasts.R b/modules/local/check_contrasts/templates/check_contrasts.R index b75a09e5..19cadb99 100644 --- a/modules/local/check_contrasts/templates/check_contrasts.R +++ b/modules/local/check_contrasts/templates/check_contrasts.R @@ -23,13 +23,15 @@ main <- function(contrasts_filename = "${contrasts}", ) } -#' Ensure contrast has exactly two groups to compare +#' Check an individual contrast comparison check_contrast <- function(contrast_name, contrasts_lst) { + # Ensure contrast has exactly two groups to compare contrast_len <- length(contrasts_lst[[contrast_name]]) assert_that( contrast_len == 2, msg = glue("Contrasts must have only two groups per comparison, but {contrast_name} has {contrast_len}") ) + # TODO: check that each sample only appears in one group } #' Combine sample sheet with contrast group diff --git a/modules/local/diffbind/prep/main.nf b/modules/local/diffbind/prep/main.nf index 78170af5..224e98ee 100644 --- a/modules/local/diffbind/prep/main.nf +++ b/modules/local/diffbind/prep/main.nf @@ -1,30 +1,26 @@ process PREP_DIFFBIND { - tag { "${metas[0].contrast}.${metas[0].tool}" } + tag { "${meta.id}.${meta.contrast}.${meta.tool}" } label 'process_single' + container "${params.containers.base}" + input: - tuple val(metas), path(bams), path(bais), path(peaks), path(ctrl_bams), path(ctrl_bais) + tuple val(meta), path(bam), path(bai), path(peak), path(ctrl_bam), path(ctrl_bai) output: - tuple val(new_meta), path("*.csv"), emit: csv + tuple val(meta), path("*.csv"), emit: csv script: - assert metas*.contrast.toSet().size() == 1 - assert metas*.tool.toSet().size() == 1 - new_meta = [contrast: metas[0].contrast, tool: metas[0].tool] - def csv_text = [['SampleID', "Replicate", 'Condition', 'bamReads', "ControlID", "bamControl", 'Peaks', 'PeakCaller']] - [metas, bams, bais, peaks] - .transpose() - .each { meta, bam, bai, peak -> - csv_text.add([meta.id, meta.rep, meta.group, bam, meta.control, ctrlbam, peak, meta.tool]) - } - csv_text = csv_text*.join(',').join(System.lineSeparator) + def csv_text = [ + ['SampleID', "Replicate", 'Condition', 'bamReads', "ControlID", "bamControl", 'Peaks', 'PeakCaller'], + [meta.id, meta.rep, meta.group, bam, meta.control, ctrl_bam, peak, meta.tool] + ]*.join(',').join(System.lineSeparator) """ - echo -ne "${csv_text}" > ${metas[0].contrast}.${metas[0].tool}.csv + echo -ne "${csv_text}" > ${meta.contrast}.${meta.tool}.${meta.id}.csv """ stub: """ - touch ${metas[0].contrast}.${metas[0].tool}.csv + touch ${meta.contrast}.${meta.tool}.${meta.id}.csv """ } diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index c0817a8d..133f7f3e 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -22,16 +22,15 @@ workflow DIFF { bam_peaks .combine( contrasts ) .map{ sample_basename1, peak_meta, bam, bai, peak, ctrl_bam, ctrl_bai, sample_basename2, con_meta -> - sample_basename1 == sample_basename2 ? [ "${meta.contrast}.${meta.tool}", peak_meta + con_meta, bam, bai, peak, ctrl_bam, ctrl_bai ] : null + sample_basename1 == sample_basename2 ? [ peak_meta + con_meta, bam, bai, peak, ctrl_bam, ctrl_bai ] : null } .unique() - .groupTuple() - .map{ it -> // drop meta.contrast - it[1..-1] - } .set{ ch_peaks_contrasts } - ch_peaks_contrasts | view ch_peaks_contrasts | PREP_DIFFBIND + PREP_DIFFBIND.out.csv + .collectFile(storeDir: "${params.outdir}/diffbind/contrasts") { meta, row -> + [ "${meta.contrast}.${meta.tool}.csv", row + '\n' ] + } emit: diff_peaks = bam_peaks From 9a9d69679ec698fb50726c8a3b769c828f482005 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 5 Dec 2023 17:42:21 -0500 Subject: [PATCH 13/35] feat: install nf-core/rmarkdownnotebook process --- modules.json | 5 + .../nf-core/rmarkdownnotebook/environment.yml | 9 ++ modules/nf-core/rmarkdownnotebook/main.nf | 135 ++++++++++++++++++ modules/nf-core/rmarkdownnotebook/meta.yml | 70 +++++++++ .../nf-core/rmarkdownnotebook/parametrize.nf | 36 +++++ 5 files changed, 255 insertions(+) create mode 100644 modules/nf-core/rmarkdownnotebook/environment.yml create mode 100644 modules/nf-core/rmarkdownnotebook/main.nf create mode 100644 modules/nf-core/rmarkdownnotebook/meta.yml create mode 100644 modules/nf-core/rmarkdownnotebook/parametrize.nf diff --git a/modules.json b/modules.json index 29998895..e1eaa684 100644 --- a/modules.json +++ b/modules.json @@ -99,6 +99,11 @@ "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", "installed_by": ["modules"] + }, + "rmarkdownnotebook": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] } } } diff --git a/modules/nf-core/rmarkdownnotebook/environment.yml b/modules/nf-core/rmarkdownnotebook/environment.yml new file mode 100644 index 00000000..c7fe8cf2 --- /dev/null +++ b/modules/nf-core/rmarkdownnotebook/environment.yml @@ -0,0 +1,9 @@ +name: rmarkdownnotebook +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::r-base=4.1.0 + - conda-forge::r-rmarkdown=2.9 + - conda-forge::r-yaml=2.2.1 diff --git a/modules/nf-core/rmarkdownnotebook/main.nf b/modules/nf-core/rmarkdownnotebook/main.nf new file mode 100644 index 00000000..9e6bc8cf --- /dev/null +++ b/modules/nf-core/rmarkdownnotebook/main.nf @@ -0,0 +1,135 @@ +include { dump_params_yml; indent_code_block } from "./parametrize" + +process RMARKDOWNNOTEBOOK { + tag "$meta.id" + label 'process_low' + + //NB: You likely want to override this with a container containing all required + //dependencies for your analysis. The container at least needs to contain the + //yaml and rmarkdown R packages. + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' : + 'biocontainers/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' }" + + input: + tuple val(meta), path(notebook) + val parameters + path input_files + + output: + tuple val(meta), path("*.html") , emit: report + tuple val(meta), path("*.parameterised.Rmd") , emit: parameterised_notebook, optional: true + tuple val(meta), path ("artifacts/*") , emit: artifacts, optional: true + tuple val(meta), path ("session_info.log") , emit: session_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def parametrize = (task.ext.parametrize == null) ? true : task.ext.parametrize + def implicit_params = (task.ext.implicit_params == null) ? true : task.ext.implicit_params + def meta_params = (task.ext.meta_params == null) ? true : task.ext.meta_params + + // Dump parameters to yaml file. + // Using a yaml file over using the CLI params because + // * no issue with escaping + // * allows to pass nested maps instead of just single values + def params_cmd = "" + def render_cmd = "" + if (parametrize) { + nb_params = [:] + if (implicit_params) { + nb_params["cpus"] = task.cpus + nb_params["artifact_dir"] = "artifacts" + nb_params["input_dir"] = "./" + } + if (meta_params) { + nb_params["meta"] = meta + } + nb_params += parameters + params_cmd = dump_params_yml(nb_params) + render_cmd = """\ + params = yaml::read_yaml('.params.yml') + + # Instead of rendering with params, produce a version of the R + # markdown with param definitions set, so the notebook itself can + # be reused + rmd_content <- readLines('${prefix}.Rmd') + + # Extract YAML content between the first two '---' + start_idx <- which(rmd_content == "---")[1] + end_idx <- which(rmd_content == "---")[2] + rmd_yaml_content <- paste(rmd_content[(start_idx+1):(end_idx-1)], collapse = "\\n") + rmd_params <- yaml::yaml.load(rmd_yaml_content) + + # Override the params + rmd_params[['params']] <- modifyList(rmd_params[['params']], params) + + # Recursive function to add 'value' to list elements, except for top-level + add_value_recursively <- function(lst, is_top_level = FALSE) { + if (!is.list(lst)) { + return(lst) + } + + lst <- lapply(lst, add_value_recursively) + if (!is_top_level) { + lst <- list(value = lst) + } + return(lst) + } + + # Reformat nested lists under 'params' to have a 'value' key recursively + rmd_params[['params']] <- add_value_recursively(rmd_params[['params']], is_top_level = TRUE) + + # Convert back to YAML string + updated_yaml_content <- as.character(yaml::as.yaml(rmd_params)) + + # Remove the old YAML content + rmd_content <- rmd_content[-((start_idx+1):(end_idx-1))] + + # Insert the updated YAML content at the right position + rmd_content <- append(rmd_content, values = unlist(strsplit(updated_yaml_content, split = "\\n")), after = start_idx) + + writeLines(rmd_content, '${prefix}.parameterised.Rmd') + + # Render based on the updated file + rmarkdown::render('${prefix}.parameterised.Rmd', output_file='${prefix}.html', envir = new.env()) + """ + } else { + render_cmd = "rmarkdown::render('${prefix}.Rmd', output_file='${prefix}.html')" + } + + """ + # Dump .params.yml heredoc (section will be empty if parametrization is disabled) + ${indent_code_block(params_cmd, 4)} + + # Create output directory + mkdir artifacts + + # Set parallelism for BLAS/MKL etc. to avoid over-booking of resources + export MKL_NUM_THREADS="$task.cpus" + export OPENBLAS_NUM_THREADS="$task.cpus" + export OMP_NUM_THREADS="$task.cpus" + + # Work around https://github.com/rstudio/rmarkdown/issues/1508 + # If the symbolic link is not replaced by a physical file + # output- and temporary files will be written to the original directory. + mv "${notebook}" "${notebook}.orig" + cp -L "${notebook}.orig" "${prefix}.Rmd" + + # Render notebook + Rscript - < versions.yml + "${task.process}": + rmarkdown: \$(Rscript -e "cat(paste(packageVersion('rmarkdown'), collapse='.'))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/rmarkdownnotebook/meta.yml b/modules/nf-core/rmarkdownnotebook/meta.yml new file mode 100644 index 00000000..f84f0682 --- /dev/null +++ b/modules/nf-core/rmarkdownnotebook/meta.yml @@ -0,0 +1,70 @@ +name: rmarkdownnotebook +description: Render an rmarkdown notebook. Supports parametrization. +keywords: + - R + - notebook + - reports +tools: + - rmarkdown: + description: Dynamic Documents for R + homepage: https://rmarkdown.rstudio.com/ + documentation: https://rmarkdown.rstudio.com/lesson-1.html + tool_dev_url: https://github.com/rstudio/rmarkdown + licence: GPL-3 +params: + - parametrize: + type: boolean + description: If true, parametrize the notebook + - implicit_params: + type: boolean + description: | + If true (default), include the implicit params + * `input_dir`, which points to the directory containing the files added via `input_files`, + * `artifact_dir`, which points to the directory where the notebook should place output files, and + * `cpus`, which contains the value of ${task.cpus} + - meta_params: + type: boolean + description: | + If true, include a parameter `meta` which contains the information specified + via the `meta` input channel. +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - notebook: + type: file + description: Rmarkdown file + pattern: "*.{Rmd}" + - parameters: + type: map + description: | + Groovy map with notebook parameters which will be passed to + rmarkdown to generate parametrized reports. + - input_files: + type: file + description: One or multiple files serving as input data for the notebook. + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: HTML report generated from Rmarkdown + pattern: "*.html" + - session_info: + type: file + description: dump of R SessionInfo + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@grst" +maintainers: + - "@grst" diff --git a/modules/nf-core/rmarkdownnotebook/parametrize.nf b/modules/nf-core/rmarkdownnotebook/parametrize.nf new file mode 100644 index 00000000..05e259eb --- /dev/null +++ b/modules/nf-core/rmarkdownnotebook/parametrize.nf @@ -0,0 +1,36 @@ +import org.yaml.snakeyaml.Yaml +import org.yaml.snakeyaml.DumperOptions + + +/** + * Multiline code blocks need to have the same indentation level + * as the `script:` section. This function re-indents code to the specified level. + */ +def indent_code_block(code, n_spaces) { + def indent_str = " ".multiply(n_spaces) + return code.stripIndent().split("\n").join("\n" + indent_str) +} + +/** + * Create a config YAML file from a groovy map + * + * @params task The process' `task` variable + * @returns a line to be inserted in the bash script. + */ +def dump_params_yml(params) { + DumperOptions options = new DumperOptions(); + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK); + def yaml = new Yaml(options) + def yaml_str = yaml.dump(params) + + // Writing the .params.yml file directly as follows does not work. + // It only works in 'exec:', but not if there is a `script:` section: + // task.workDir.resolve('.params.yml').text = yaml_str + + // Therefore, we inject it into the bash script: + return """\ + cat <<"END_PARAMS_SECTION" > ./.params.yml + ${indent_code_block(yaml_str, 8)} + END_PARAMS_SECTION + """ +} From c8c384a128d5ac48cff15d4a591663bc4b055df5 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 6 Dec 2023 11:05:56 -0500 Subject: [PATCH 14/35] fix: error in collectFile https://github.com/nextflow-io/nextflow/discussions/4559 --- subworkflows/local/diff/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index 133f7f3e..8d851b38 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -29,7 +29,7 @@ workflow DIFF { ch_peaks_contrasts | PREP_DIFFBIND PREP_DIFFBIND.out.csv .collectFile(storeDir: "${params.outdir}/diffbind/contrasts") { meta, row -> - [ "${meta.contrast}.${meta.tool}.csv", row + '\n' ] + [ "${meta.contrast}.${meta.tool}.csv", row ] } emit: diff_peaks = bam_peaks From 54a5f99c6f78df1643556bbd3c1b5b856cf62095 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 7 Dec 2023 14:46:26 -0500 Subject: [PATCH 15/35] feat: render diffbind report --- assets/diffbind_report.Rmd | 189 ++++++++++++++++++++++++++++++++ bin/DiffBind_pipeliner.Rmd | 17 +-- nextflow.config | 5 + subworkflows/local/diff/main.nf | 40 ++++++- 4 files changed, 236 insertions(+), 15 deletions(-) create mode 100644 assets/diffbind_report.Rmd diff --git a/assets/diffbind_report.Rmd b/assets/diffbind_report.Rmd new file mode 100644 index 00000000..4dc3555e --- /dev/null +++ b/assets/diffbind_report.Rmd @@ -0,0 +1,189 @@ +--- +title: "DiffBind: CCBR ChIP-seq pipeline" +output: + html_document: + toc: true + toc_depth: 2 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macs" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") + +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +``` + +### **Groups being compared:** +#### *`r contrasts`* +### **Peak sources:** +#### *`r peakcaller`* +### **Report generated:** +#### *`r dateandtime`* + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = TRUE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +``` + +
+ +## Read in sample sheet information and peak information +```{r samples, echo=FALSE, warning=FALSE,message=FALSE} +samples <- dba(sampleSheet = csvfile) +consensus <- dba.peakset(samples, consensus = DBA_CONDITION) +print(samples) +``` + +
+ +## Plot raw information about the peaks +### Correlation heatmap: Only peaks +```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(plot(samples, main = ""), silent = TRUE) +``` + +### PCA: Only peaks +```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"} +try(dba.plotPCA(samples, DBA_CONDITION), silent = TRUE) +``` + +### Overlapping peak counts +```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples, 1:nrow(samples$samples)) +} else { + dba.plotVenn(samples, samples$masks[[3]]) + dba.plotVenn(samples, samples$masks[[4]]) + dba.plotVenn(consensus, consensus$masks$Consensus) +} +``` + +```{r peaksORsummits, echo=F} +if (grepl("narrow", samples$samples$Peaks[1])) { + summits <- TRUE + print("Narrow peak calling tool.") + print("Differential peaks are 250bp upstream and downstream of the summits.") +} else if (grepl("broad", samples$samples$Peaks[1])) { + summits <- FALSE + print("Broad peak calling tool.") + print("Differential peaks are consensus peaks.") +} else { + summits <- FALSE + print("Indeterminate peak calling tool.") + print("Differential peaks are consensus peaks.") +} +``` + +## Read in bam file information under all peaks found in at least two samples +```{r DBcount, echo=FALSE, warning=FALSE,message=FALSE} +if (summits == TRUE) { + DBdataCounts <- dba.count(samples, summits = 250) +} else { + DBdataCounts <- dba.count(samples) +} +print(DBdataCounts) +``` + +
+ +## Plot raw information about all analyzed peaks +### Correlation heatmap: Peaks and reads +```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(plot(DBdataCounts, main = ""), silent = TRUE) +``` + +### Heatmap: Average signal across each peak +```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(dba.plotHeatmap(DBdataCounts, correlations = FALSE), silent = TRUE) +``` + +### PCA: Peaks and reads +```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} +try(dba.plotPCA(DBdataCounts, DBA_CONDITION), silent = TRUE) +``` + +## Associate individual samples with the different contrasts +```{r contrast, echo=FALSE, warning=FALSE,message=FALSE} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers = 2, categories = DBA_CONDITION) +print(DBdatacontrast) +``` + +
+ +## Call differential peaks using Deseq2 and EdgeR +```{r analyze, echo=FALSE, warning=FALSE,message=FALSE} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) +``` + +```{r, echo=FALSE, warning=FALSE,message=FALSE} +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) +DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) +``` + +### PCA: DeSeq2 +```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} +try(dba.plotPCA(DBAnalysisDeseq2, contrast = 1, method = DBA_DESEQ2), silent = TRUE) +``` + +### PCA: EdgeR +```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} +try(dba.plotPCA(DBAnalysisEdgeR, contrast = 1, method = DBA_EDGER), silent = TRUE) +``` + +### MANorm: (left) Deseq2, (right) EdgeR +```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} +par(mfcol = c(1, 2)) +try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) +try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) +``` + +### Volcano plot: DeSeq2 +```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) +``` + +### Volcano plot: EdgeR +```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} +try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) +``` + +### Boxplots: (left) Deseq2, (right) EdgeR +```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} +par(mfcol = c(1, 2)) +if (length(DBReportDeseq2) > 0) { + try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) +} else { + plot(0, type = "n", axes = FALSE, ann = FALSE) +} +try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) +``` + +## Differentially bound peaks: Deseq2 output +```{r Deseq2Report, echo=FALSE, warning=FALSE,message=FALSE} +outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt") +write.table(DBReportDeseq2, outfile, quote = F, sep = "\t", row.names = F) +DT::datatable(data.frame(DBReportDeseq2), rownames = F) +``` + +## Differentially bound peaks: EdgeR output +```{r EdgeRReport, echo=FALSE, warning=FALSE,message=FALSE} +outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR.txt") +write.table(DBReportEdgeR, outfile, quote = F, sep = "\t", row.names = F) +DT::datatable(data.frame(DBReportEdgeR), rownames = F) +``` + +## R tool version information +```{r Info, echo=FALSE, message=FALSE, warning=FALSE} +sessionInfo() +``` diff --git a/bin/DiffBind_pipeliner.Rmd b/bin/DiffBind_pipeliner.Rmd index f4d75622..0e88d4c7 100644 --- a/bin/DiffBind_pipeliner.Rmd +++ b/bin/DiffBind_pipeliner.Rmd @@ -6,10 +6,8 @@ output: toc_depth: 2 params: csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" - projectID: "" - projectDesc: "" + contrast: "group1_vs_group2" + tool: "macs" --- ```{r, include=FALSE, warning=FALSE, message=FALSE} -## grab args -projectID <- params$projectID -projectDesc <- params$projectDesc dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") csvfile <- params$csvfile -contrasts <- params$contrasts -peakcaller <- params$peakcaller +contrasts <- params$contrast +peakcaller <- params$tool ``` -### **Project:** -#### *`r projectID`* -### **Description:** -#### *`r projectDesc`* ### **Groups being compared:** #### *`r contrasts`* ### **Peak sources:** diff --git a/nextflow.config b/nextflow.config index 9089522a..dabd3e81 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,6 +69,11 @@ params { de_novo = true jaspar_db = 'assets/JASPAR2022_CORE_vertebrates_non-redundant_pfms_jaspar.txt' // source https://jaspar.genereg.net/download/data/2022/CORE/JASPAR2022_CORE_vertebrates_non-redundant_pfms_jaspar.txt } + + diffbind { + report = "${projectDir}/assets/diffbind_report.Rmd" + } + run { // some steps can be turned on/off for debugging purposes qc = true deeptools = true diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index 8d851b38..eebac513 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -1,5 +1,6 @@ -include { CHECK_CONTRASTS } from "../../../modules/local/check_contrasts/" -include { PREP_DIFFBIND } from "../../../modules/local/diffbind/prep/" +include { CHECK_CONTRASTS } from "../../../modules/local/check_contrasts/" +include { PREP_DIFFBIND } from "../../../modules/local/diffbind/prep/" +include { RMARKDOWNNOTEBOOK } from '../../../modules/nf-core/rmarkdownnotebook/' workflow DIFF { take: @@ -27,10 +28,45 @@ workflow DIFF { .unique() .set{ ch_peaks_contrasts } ch_peaks_contrasts | PREP_DIFFBIND + + ch_peaks_contrasts + .map{ meta, bam, bai, peak, ctrl_bam, ctrl_bai -> + [ [contrast: meta.contrast, tool: meta.tool], [bam, bai, peak, ctrl_bam, ctrl_bai] ] + } + .groupTuple() + .map{ meta, files -> + [ meta, files.flatten().unique() ] + } + .set{ data_files } + PREP_DIFFBIND.out.csv .collectFile(storeDir: "${params.outdir}/diffbind/contrasts") { meta, row -> [ "${meta.contrast}.${meta.tool}.csv", row ] } + .map{ contrast_file -> + meta = [:] + meta_list = contrast_file.baseName.tokenize('.') + meta.contrast = meta_list[0] + meta.tool = meta_list[1] + [ meta, contrast_file ] + } + .cross(data_files) + .map{ contrast, files -> + meta = contrast[0] + meta.csvfile = contrast[1] + file_list = [contrast[1], files[1]].flatten().unique() + [ meta, file_list ] // meta, csv, filelist + } + .set{contrast_files} + contrast_files + .combine(Channel.fromPath(file(params.diffbind.report, checkIfExists: true))) + .map{ meta, files, rmarkdown -> + [ meta, rmarkdown ] + } + .set{ch_rmarkdown} + contrast_files.map{ meta, files -> files}.flatten().unique().collect().set{file_list} + RMARKDOWNNOTEBOOK( ch_rmarkdown, [], file_list ) + emit: diff_peaks = bam_peaks From 11429ab47bfd9bee9a99a287490bd5e94f8fffdc Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 7 Dec 2023 15:58:00 -0500 Subject: [PATCH 16/35] refactor: move diffbind rmd to assets/ --- assets/diffbind_report.Rmd | 8 +- bin/DiffBind_pipeliner.Rmd | 189 ------------------------------------- 2 files changed, 4 insertions(+), 193 deletions(-) delete mode 100644 bin/DiffBind_pipeliner.Rmd diff --git a/assets/diffbind_report.Rmd b/assets/diffbind_report.Rmd index 4dc3555e..0e88d4c7 100644 --- a/assets/diffbind_report.Rmd +++ b/assets/diffbind_report.Rmd @@ -6,8 +6,8 @@ output: toc_depth: 2 params: csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" + contrast: "group1_vs_group2" + tool: "macs" --- - -```{r, include=FALSE, warning=FALSE, message=FALSE} -dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") - -csvfile <- params$csvfile -contrasts <- params$contrast -peakcaller <- params$tool -``` - -### **Groups being compared:** -#### *`r contrasts`* -### **Peak sources:** -#### *`r peakcaller`* -### **Report generated:** -#### *`r dateandtime`* - -```{r setup, echo=FALSE, warning=FALSE,message=FALSE} -knitr::opts_chunk$set(echo = TRUE) -suppressMessages(library(DT)) -suppressMessages(library(DiffBind)) -``` - -
- -## Read in sample sheet information and peak information -```{r samples, echo=FALSE, warning=FALSE,message=FALSE} -samples <- dba(sampleSheet = csvfile) -consensus <- dba.peakset(samples, consensus = DBA_CONDITION) -print(samples) -``` - -
- -## Plot raw information about the peaks -### Correlation heatmap: Only peaks -```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -try(plot(samples, main = ""), silent = TRUE) -``` - -### PCA: Only peaks -```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"} -try(dba.plotPCA(samples, DBA_CONDITION), silent = TRUE) -``` - -### Overlapping peak counts -```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples, 1:nrow(samples$samples)) -} else { - dba.plotVenn(samples, samples$masks[[3]]) - dba.plotVenn(samples, samples$masks[[4]]) - dba.plotVenn(consensus, consensus$masks$Consensus) -} -``` - -```{r peaksORsummits, echo=F} -if (grepl("narrow", samples$samples$Peaks[1])) { - summits <- TRUE - print("Narrow peak calling tool.") - print("Differential peaks are 250bp upstream and downstream of the summits.") -} else if (grepl("broad", samples$samples$Peaks[1])) { - summits <- FALSE - print("Broad peak calling tool.") - print("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print("Indeterminate peak calling tool.") - print("Differential peaks are consensus peaks.") -} -``` - -## Read in bam file information under all peaks found in at least two samples -```{r DBcount, echo=FALSE, warning=FALSE,message=FALSE} -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits = 250) -} else { - DBdataCounts <- dba.count(samples) -} -print(DBdataCounts) -``` - -
- -## Plot raw information about all analyzed peaks -### Correlation heatmap: Peaks and reads -```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -try(plot(DBdataCounts, main = ""), silent = TRUE) -``` - -### Heatmap: Average signal across each peak -```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -try(dba.plotHeatmap(DBdataCounts, correlations = FALSE), silent = TRUE) -``` - -### PCA: Peaks and reads -```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} -try(dba.plotPCA(DBdataCounts, DBA_CONDITION), silent = TRUE) -``` - -## Associate individual samples with the different contrasts -```{r contrast, echo=FALSE, warning=FALSE,message=FALSE} -DBdatacontrast <- dba.contrast(DBdataCounts, minMembers = 2, categories = DBA_CONDITION) -print(DBdatacontrast) -``` - -
- -## Call differential peaks using Deseq2 and EdgeR -```{r analyze, echo=FALSE, warning=FALSE,message=FALSE} -DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) -``` - -```{r, echo=FALSE, warning=FALSE,message=FALSE} -DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) -``` - -### PCA: DeSeq2 -```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} -try(dba.plotPCA(DBAnalysisDeseq2, contrast = 1, method = DBA_DESEQ2), silent = TRUE) -``` - -### PCA: EdgeR -```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} -try(dba.plotPCA(DBAnalysisEdgeR, contrast = 1, method = DBA_EDGER), silent = TRUE) -``` - -### MANorm: (left) Deseq2, (right) EdgeR -```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} -par(mfcol = c(1, 2)) -try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) -``` - -### Volcano plot: DeSeq2 -```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) -``` - -### Volcano plot: EdgeR -```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) -``` - -### Boxplots: (left) Deseq2, (right) EdgeR -```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} -par(mfcol = c(1, 2)) -if (length(DBReportDeseq2) > 0) { - try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) -} else { - plot(0, type = "n", axes = FALSE, ann = FALSE) -} -try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) -``` - -## Differentially bound peaks: Deseq2 output -```{r Deseq2Report, echo=FALSE, warning=FALSE,message=FALSE} -outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt") -write.table(DBReportDeseq2, outfile, quote = F, sep = "\t", row.names = F) -DT::datatable(data.frame(DBReportDeseq2), rownames = F) -``` - -## Differentially bound peaks: EdgeR output -```{r EdgeRReport, echo=FALSE, warning=FALSE,message=FALSE} -outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR.txt") -write.table(DBReportEdgeR, outfile, quote = F, sep = "\t", row.names = F) -DT::datatable(data.frame(DBReportEdgeR), rownames = F) -``` - -## R tool version information -```{r Info, echo=FALSE, message=FALSE, warning=FALSE} -sessionInfo() -``` From b9b441aa1dc16744c30a356c823574c856642a67 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 7 Dec 2023 16:15:37 -0500 Subject: [PATCH 17/35] refactor: improve channel operations for diffbind rmd --- subworkflows/local/diff/main.nf | 40 ++++++++++++++------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index eebac513..dcdba3b7 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -29,16 +29,6 @@ workflow DIFF { .set{ ch_peaks_contrasts } ch_peaks_contrasts | PREP_DIFFBIND - ch_peaks_contrasts - .map{ meta, bam, bai, peak, ctrl_bam, ctrl_bai -> - [ [contrast: meta.contrast, tool: meta.tool], [bam, bai, peak, ctrl_bam, ctrl_bai] ] - } - .groupTuple() - .map{ meta, files -> - [ meta, files.flatten().unique() ] - } - .set{ data_files } - PREP_DIFFBIND.out.csv .collectFile(storeDir: "${params.outdir}/diffbind/contrasts") { meta, row -> [ "${meta.contrast}.${meta.tool}.csv", row ] @@ -48,24 +38,28 @@ workflow DIFF { meta_list = contrast_file.baseName.tokenize('.') meta.contrast = meta_list[0] meta.tool = meta_list[1] + meta.csvfile = contrast_file.baseName [ meta, contrast_file ] } - .cross(data_files) - .map{ contrast, files -> - meta = contrast[0] - meta.csvfile = contrast[1] - file_list = [contrast[1], files[1]].flatten().unique() - [ meta, file_list ] // meta, csv, filelist + .tap{contrast_file} // [ meta, contrast] + .map{ meta, file -> meta } + .set{contrast_meta} // [ meta ] + + ch_peaks_contrasts + .map{ meta, bam, bai, peak, ctrl_bam, ctrl_bai -> + [bam, bai, peak, ctrl_bam, ctrl_bai] } - .set{contrast_files} - contrast_files + .mix(contrast_file.map{meta,csv->csv}) + .flatten() + .unique() + .collect() + .set{ ch_data_files } + + contrast_meta .combine(Channel.fromPath(file(params.diffbind.report, checkIfExists: true))) - .map{ meta, files, rmarkdown -> - [ meta, rmarkdown ] - } .set{ch_rmarkdown} - contrast_files.map{ meta, files -> files}.flatten().unique().collect().set{file_list} - RMARKDOWNNOTEBOOK( ch_rmarkdown, [], file_list ) + + RMARKDOWNNOTEBOOK( ch_rmarkdown, [], ch_data_files ) emit: diff_peaks = bam_peaks From 30673c9acecc2c833a92b539c9a3a5dbb2839b21 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 7 Dec 2023 16:22:56 -0500 Subject: [PATCH 18/35] feat: check that each sample only appears in one group --- .../local/check_contrasts/templates/check_contrasts.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/modules/local/check_contrasts/templates/check_contrasts.R b/modules/local/check_contrasts/templates/check_contrasts.R index 19cadb99..cf44e192 100644 --- a/modules/local/check_contrasts/templates/check_contrasts.R +++ b/modules/local/check_contrasts/templates/check_contrasts.R @@ -25,13 +25,18 @@ main <- function(contrasts_filename = "${contrasts}", #' Check an individual contrast comparison check_contrast <- function(contrast_name, contrasts_lst) { + curr_contrast <- contrasts_lst[[contrast_name]] # Ensure contrast has exactly two groups to compare - contrast_len <- length(contrasts_lst[[contrast_name]]) + contrast_len <- length(curr_contrast) assert_that( contrast_len == 2, msg = glue("Contrasts must have only two groups per comparison, but {contrast_name} has {contrast_len}") ) - # TODO: check that each sample only appears in one group + group_names <- names(curr_contrast) + assert_that( + length(intersect(curr_contrast[[group_names[1]]], curr_contrast[[group_names[2]]])) == 0, + msg = glue("Each sample can only appear in one group per contrast ({contrast_name})") + ) } #' Combine sample sheet with contrast group From 39b2cf910e7251bf8af1b39d500e6f6ac891fe58 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 7 Dec 2023 16:37:29 -0500 Subject: [PATCH 19/35] feat: patch rmarkdownnotebook w/ custom container & stub --- modules.json | 3 +- modules/nf-core/rmarkdownnotebook/main.nf | 20 ++++++---- .../rmarkdownnotebook/rmarkdownnotebook.diff | 37 +++++++++++++++++++ 3 files changed, 52 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff diff --git a/modules.json b/modules.json index e1eaa684..1fe341e7 100644 --- a/modules.json +++ b/modules.json @@ -103,7 +103,8 @@ "rmarkdownnotebook": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff" } } } diff --git a/modules/nf-core/rmarkdownnotebook/main.nf b/modules/nf-core/rmarkdownnotebook/main.nf index 9e6bc8cf..431cc316 100644 --- a/modules/nf-core/rmarkdownnotebook/main.nf +++ b/modules/nf-core/rmarkdownnotebook/main.nf @@ -4,13 +4,7 @@ process RMARKDOWNNOTEBOOK { tag "$meta.id" label 'process_low' - //NB: You likely want to override this with a container containing all required - //dependencies for your analysis. The container at least needs to contain the - //yaml and rmarkdown R packages. - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' : - 'biocontainers/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' }" + container 'nciccbr/ccbr_diffbind:v1' input: tuple val(meta), path(notebook) @@ -132,4 +126,16 @@ process RMARKDOWNNOTEBOOK { rmarkdown: \$(Rscript -e "cat(paste(packageVersion('rmarkdown'), collapse='.'))") END_VERSIONS """ + + stub: + """ + touch ${notebook.baseName}.html + + R -e 'writeLines(capture.output(sessionInfo()), "session_info.log")' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rmarkdown: \$(Rscript -e "cat(paste(packageVersion('rmarkdown'), collapse='.'))") + END_VERSIONS + """ } diff --git a/modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff b/modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff new file mode 100644 index 00000000..f71d9002 --- /dev/null +++ b/modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff @@ -0,0 +1,37 @@ +Changes in module 'nf-core/rmarkdownnotebook' +--- modules/nf-core/rmarkdownnotebook/main.nf ++++ modules/nf-core/rmarkdownnotebook/main.nf +@@ -4,13 +4,7 @@ + tag "$meta.id" + label 'process_low' + +- //NB: You likely want to override this with a container containing all required +- //dependencies for your analysis. The container at least needs to contain the +- //yaml and rmarkdown R packages. +- conda "${moduleDir}/environment.yml" +- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +- 'https://depot.galaxyproject.org/singularity/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' : +- 'biocontainers/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' }" ++ container 'nciccbr/ccbr_diffbind:v1' + + input: + tuple val(meta), path(notebook) +@@ -132,4 +126,16 @@ + rmarkdown: \$(Rscript -e "cat(paste(packageVersion('rmarkdown'), collapse='.'))") + END_VERSIONS + """ ++ ++ stub: ++ """ ++ touch ${notebook.baseName}.html ++ ++ R -e 'writeLines(capture.output(sessionInfo()), "session_info.log")' ++ ++ cat <<-END_VERSIONS > versions.yml ++ "${task.process}": ++ rmarkdown: \$(Rscript -e "cat(paste(packageVersion('rmarkdown'), collapse='.'))") ++ END_VERSIONS ++ """ + } + +************************************************************ From 2f28edca0f81ff862774573f0c41d03784b646e5 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 7 Dec 2023 18:18:56 -0500 Subject: [PATCH 20/35] refactor: set meta.id --- subworkflows/local/diff/main.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index dcdba3b7..47418856 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -57,6 +57,10 @@ workflow DIFF { contrast_meta .combine(Channel.fromPath(file(params.diffbind.report, checkIfExists: true))) + .map{ meta, rmd -> + meta.id = "${meta.contrast}.${meta.tool}" + [ meta, rmd ] + } .set{ch_rmarkdown} RMARKDOWNNOTEBOOK( ch_rmarkdown, [], ch_data_files ) From 79e910efceb5f3557f0de41c6c041184e7603285 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 11 Dec 2023 11:55:38 -0500 Subject: [PATCH 21/35] fix: set rmarkdown parameters in same channel as files --- modules/nf-core/rmarkdownnotebook/main.nf | 5 ++- .../rmarkdownnotebook/rmarkdownnotebook.diff | 17 +++++++--- subworkflows/local/diff/main.nf | 31 +++++++++---------- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/modules/nf-core/rmarkdownnotebook/main.nf b/modules/nf-core/rmarkdownnotebook/main.nf index 431cc316..f39aeaeb 100644 --- a/modules/nf-core/rmarkdownnotebook/main.nf +++ b/modules/nf-core/rmarkdownnotebook/main.nf @@ -1,14 +1,13 @@ include { dump_params_yml; indent_code_block } from "./parametrize" process RMARKDOWNNOTEBOOK { - tag "$meta.id" + tag { meta.id } label 'process_low' container 'nciccbr/ccbr_diffbind:v1' input: - tuple val(meta), path(notebook) - val parameters + tuple val(meta), val(parameters), path(notebook) path input_files output: diff --git a/modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff b/modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff index f71d9002..3ffe39f2 100644 --- a/modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff +++ b/modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff @@ -1,8 +1,12 @@ Changes in module 'nf-core/rmarkdownnotebook' --- modules/nf-core/rmarkdownnotebook/main.nf +++ modules/nf-core/rmarkdownnotebook/main.nf -@@ -4,13 +4,7 @@ - tag "$meta.id" +@@ -1,20 +1,13 @@ + include { dump_params_yml; indent_code_block } from "./parametrize" + + process RMARKDOWNNOTEBOOK { +- tag "$meta.id" ++ tag { meta.id } label 'process_low' - //NB: You likely want to override this with a container containing all required @@ -15,8 +19,13 @@ Changes in module 'nf-core/rmarkdownnotebook' + container 'nciccbr/ccbr_diffbind:v1' input: - tuple val(meta), path(notebook) -@@ -132,4 +126,16 @@ +- tuple val(meta), path(notebook) +- val parameters ++ tuple val(meta), val(parameters), path(notebook) + path input_files + + output: +@@ -132,4 +125,16 @@ rmarkdown: \$(Rscript -e "cat(paste(packageVersion('rmarkdown'), collapse='.'))") END_VERSIONS """ diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index 47418856..7525c8d1 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -34,36 +34,35 @@ workflow DIFF { [ "${meta.contrast}.${meta.tool}.csv", row ] } .map{ contrast_file -> - meta = [:] + params = [:] meta_list = contrast_file.baseName.tokenize('.') - meta.contrast = meta_list[0] - meta.tool = meta_list[1] - meta.csvfile = contrast_file.baseName - [ meta, contrast_file ] + params.csvfile = contrast_file.getName() + params.contrast = meta_list[0] + params.tool = meta_list[1] + + meta = [:] + meta.id = "${params.contrast}.${params.tool}" + + [ meta, params, contrast_file ] } - .tap{contrast_file} // [ meta, contrast] - .map{ meta, file -> meta } - .set{contrast_meta} // [ meta ] + .set{ contrast_files } // [ meta, params, contrast ] ch_peaks_contrasts .map{ meta, bam, bai, peak, ctrl_bam, ctrl_bai -> [bam, bai, peak, ctrl_bam, ctrl_bai] } - .mix(contrast_file.map{meta,csv->csv}) + .mix(contrast_files.map{ meta,params,csv -> csv }) .flatten() .unique() .collect() .set{ ch_data_files } - contrast_meta + contrast_files + .map{ meta, params, file -> [ meta, params ] } .combine(Channel.fromPath(file(params.diffbind.report, checkIfExists: true))) - .map{ meta, rmd -> - meta.id = "${meta.contrast}.${meta.tool}" - [ meta, rmd ] - } - .set{ch_rmarkdown} + .set{ ch_rmarkdown } - RMARKDOWNNOTEBOOK( ch_rmarkdown, [], ch_data_files ) + RMARKDOWNNOTEBOOK( ch_rmarkdown, ch_data_files ) emit: diff_peaks = bam_peaks From bbf8bb28540852912fc2695eb7c252fb2fc0e561 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 11 Dec 2023 12:57:28 -0500 Subject: [PATCH 22/35] fix: csv concatentation w/ newlines --- modules/local/diffbind/prep/main.nf | 2 +- subworkflows/local/diff/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/diffbind/prep/main.nf b/modules/local/diffbind/prep/main.nf index 224e98ee..c86cbff2 100644 --- a/modules/local/diffbind/prep/main.nf +++ b/modules/local/diffbind/prep/main.nf @@ -14,7 +14,7 @@ process PREP_DIFFBIND { def csv_text = [ ['SampleID', "Replicate", 'Condition', 'bamReads', "ControlID", "bamControl", 'Peaks', 'PeakCaller'], [meta.id, meta.rep, meta.group, bam, meta.control, ctrl_bam, peak, meta.tool] - ]*.join(',').join(System.lineSeparator) + ]*.join(',').join(System.lineSeparator) + '\n' """ echo -ne "${csv_text}" > ${meta.contrast}.${meta.tool}.${meta.id}.csv """ diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diff/main.nf index 7525c8d1..5a20e8d1 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diff/main.nf @@ -30,7 +30,7 @@ workflow DIFF { ch_peaks_contrasts | PREP_DIFFBIND PREP_DIFFBIND.out.csv - .collectFile(storeDir: "${params.outdir}/diffbind/contrasts") { meta, row -> + .collectFile(storeDir: "${params.outdir}/diffbind/contrasts", newLine: false, keepHeader: true, skip: 1) { meta, row -> [ "${meta.contrast}.${meta.tool}.csv", row ] } .map{ contrast_file -> From 34c480ba18d031214d5ee2a60df167cf434e8da6 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 13 Dec 2023 11:02:27 -0500 Subject: [PATCH 23/35] fix: add new line to end of converted sicer bed files --- modules/local/peaks.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/peaks.nf b/modules/local/peaks.nf index ae2840af..093157b4 100644 --- a/modules/local/peaks.nf +++ b/modules/local/peaks.nf @@ -185,10 +185,10 @@ process CONVERT_SICER { // https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d5838 outBed[i] = "\t".join(tmp[0:3] + ["Peak"+str(i+1),score]) # output broadPeak columns: chrom, start, end, name, ChIP tag count, strand, fold-enrichment, -log10 p-value, -log10 q-value with open("${sicer_peaks.baseName}.converted.bed",'w') as g: - g.write( "\n".join(outBed) ) + g.write( "\n".join(outBed) + '\n' ) if outBroadPeak[0] != None: with open("${sicer_peaks.baseName}.converted_sicer.broadPeak", 'w') as h: - h.write( "\n".join(outBroadPeak) ) + h.write( "\n".join(outBroadPeak) + '\n' ) /$ stub: From 1f173ef39c903d40040c58ded96ccf5212f0f4b2 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 13 Dec 2023 13:56:13 -0500 Subject: [PATCH 24/35] fix: don't fail to plot peak widths when gem is only peak caller --- bin/plot_peak_widths.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bin/plot_peak_widths.R b/bin/plot_peak_widths.R index 08f083ae..a7d8479b 100755 --- a/bin/plot_peak_widths.R +++ b/bin/plot_peak_widths.R @@ -13,8 +13,14 @@ tsv_filename <- args[1] peak_dat <- read_tsv(tsv_filename) %>% mutate( peak_width = chromEnd - chromStart, - ) %>% - filter(tool != "gem") # exclude gem because width is always 200 + ) +tools <- peak_dat %>% + pull(tool) %>% + unique() +if (!((length(tools) == 1) & (tools == "gem"))) { + peak_dat <- peak_dat %>% + filter(tool != "gem") # exclude gem because width is always 200 +} sample_ids <- peak_dat %>% pull(sample_id) %>% From 6c39c6b07902a6c031b3d80c0de1ddb618fb704d Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 14 Dec 2023 09:28:43 -0500 Subject: [PATCH 25/35] refactor: switch example contrast names --- assets/contrasts_full_mm10.yml | 2 +- assets/contrasts_test_mm10.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/contrasts_full_mm10.yml b/assets/contrasts_full_mm10.yml index 761a435f..68022fcc 100644 --- a/assets/contrasts_full_mm10.yml +++ b/assets/contrasts_full_mm10.yml @@ -8,5 +8,5 @@ celltype: macrophage: - CTCF_ChIP_macrophage_p20 - CTCF_ChIP_macrophage_p3 - cancer: + fibroblast: - CTCF_ChIP_MEF_p20 diff --git a/assets/contrasts_test_mm10.yml b/assets/contrasts_test_mm10.yml index bfc81f58..90b55cf1 100644 --- a/assets/contrasts_test_mm10.yml +++ b/assets/contrasts_test_mm10.yml @@ -1,5 +1,5 @@ celltype: macrophage: - CTCF_ChIP_macrophage_p20 - MEF: + fibroblast: - CTCF_ChIP_MEF_p20 From 4f93d34e751dba265fab4b6aaef36b5f7079f06e Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 14 Dec 2023 14:12:03 -0500 Subject: [PATCH 26/35] refactor: remove EdgeR diffbind analysis --- assets/diffbind_report.Rmd | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/assets/diffbind_report.Rmd b/assets/diffbind_report.Rmd index 0e88d4c7..b2283f19 100644 --- a/assets/diffbind_report.Rmd +++ b/assets/diffbind_report.Rmd @@ -120,45 +120,33 @@ print(DBdatacontrast)
-## Call differential peaks using Deseq2 and EdgeR +## Call differential peaks using Deseq2 ```{r analyze, echo=FALSE, warning=FALSE,message=FALSE} DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) ``` ```{r, echo=FALSE, warning=FALSE,message=FALSE} DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) ``` -### PCA: DeSeq2 +### PCA ```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} try(dba.plotPCA(DBAnalysisDeseq2, contrast = 1, method = DBA_DESEQ2), silent = TRUE) ``` -### PCA: EdgeR -```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} -try(dba.plotPCA(DBAnalysisEdgeR, contrast = 1, method = DBA_EDGER), silent = TRUE) -``` -### MANorm: (left) Deseq2, (right) EdgeR +### MANorm ```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} -par(mfcol = c(1, 2)) + try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) ``` -### Volcano plot: DeSeq2 +### Volcano plot ```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE) ``` -### Volcano plot: EdgeR -```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) -``` - -### Boxplots: (left) Deseq2, (right) EdgeR +### Boxplots ```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} par(mfcol = c(1, 2)) if (length(DBReportDeseq2) > 0) { @@ -166,23 +154,15 @@ if (length(DBReportDeseq2) > 0) { } else { plot(0, type = "n", axes = FALSE, ann = FALSE) } -try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER), silent = TRUE) ``` -## Differentially bound peaks: Deseq2 output +## Differentially bound peaks ```{r Deseq2Report, echo=FALSE, warning=FALSE,message=FALSE} outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt") write.table(DBReportDeseq2, outfile, quote = F, sep = "\t", row.names = F) DT::datatable(data.frame(DBReportDeseq2), rownames = F) ``` -## Differentially bound peaks: EdgeR output -```{r EdgeRReport, echo=FALSE, warning=FALSE,message=FALSE} -outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR.txt") -write.table(DBReportEdgeR, outfile, quote = F, sep = "\t", row.names = F) -DT::datatable(data.frame(DBReportEdgeR), rownames = F) -``` - ## R tool version information ```{r Info, echo=FALSE, message=FALSE, warning=FALSE} sessionInfo() From 448eaf3809d522b27f80a82a4789f1ce3cae22ca Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 14 Dec 2023 16:45:21 -0500 Subject: [PATCH 27/35] fix: only exclude gem from peak widths plot of there's at least two peak callers --- bin/plot_peak_widths.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/plot_peak_widths.R b/bin/plot_peak_widths.R index a7d8479b..0d086ae4 100755 --- a/bin/plot_peak_widths.R +++ b/bin/plot_peak_widths.R @@ -17,7 +17,7 @@ peak_dat <- read_tsv(tsv_filename) %>% tools <- peak_dat %>% pull(tool) %>% unique() -if (!((length(tools) == 1) & (tools == "gem"))) { +if (length(tools) > 1) { peak_dat <- peak_dat %>% filter(tool != "gem") # exclude gem because width is always 200 } From 109f24b0ac759e9d68662483b6363e930c3e0754 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 14 Dec 2023 16:46:30 -0500 Subject: [PATCH 28/35] refactor: run manorm if any sample has only one replicate otherwise run diffbind --- main.nf | 18 +++- modules/local/manorm/main.nf | 35 ++++++++ subworkflows/local/{diff => diffbind}/main.nf | 49 ++--------- subworkflows/local/differential/main.nf | 88 +++++++++++++++++++ subworkflows/local/manorm/main.nf | 17 ++++ subworkflows/local/peaks.nf | 11 +++ 6 files changed, 172 insertions(+), 46 deletions(-) create mode 100644 modules/local/manorm/main.nf rename subworkflows/local/{diff => diffbind}/main.nf (51%) create mode 100644 subworkflows/local/differential/main.nf create mode 100644 subworkflows/local/manorm/main.nf diff --git a/main.nf b/main.nf index 99922987..7bc459fe 100644 --- a/main.nf +++ b/main.nf @@ -27,7 +27,7 @@ include { QC } from './subworkflows/local/qc.nf' include { CALL_PEAKS } from './subworkflows/local/peaks.nf' include { CONSENSUS_PEAKS } from './subworkflows/CCBR/consensus_peaks/' include { ANNOTATE } from './subworkflows/local/annotate.nf' -include { DIFF } from './subworkflows/local/diff/' +include { DIFF } from './subworkflows/local/differential/' // MODULES include { CUTADAPT } from "./modules/CCBR/cutadapt" @@ -74,7 +74,7 @@ workflow CHIPSEQ { deduped_bam | PHANTOM_PEAKS PHANTOM_PEAKS.out.fraglen | PPQT_PROCESS - PPQT_PROCESS.out.fraglen.set {frag_lengths } + PPQT_PROCESS.out.fraglen.set { frag_lengths } ch_multiqc = Channel.of() if (params.run.qc) { @@ -124,10 +124,20 @@ workflow CHIPSEQ { CALL_PEAKS.out.bam_peaks .combine(deduped_bam) .map{meta1, bam1, bai1, peak, tool, meta2, bam2, bai2 -> - meta1.control == meta2.id ? [ meta1.sample_basename, meta1 + [tool: tool], bam1, bai1, peak, bam2, bai2 ] : null + meta1.control == meta2.id ? [ meta1 + [tool: tool], bam1, bai1, peak, bam2, bai2 ] : null } .set{bam_peaks} - DIFF( bam_peaks, INPUT_CHECK.out.csv, contrasts ) + CALL_PEAKS.out.tagalign_peaks + .join(frag_lengths) + .map{ meta, tagalign, peak, tool, frag_len -> + [ meta + [tool: tool, fraglen: frag_len], tagalign, peak ] + } + .set{ tagalign_peaks } + DIFF( bam_peaks, + tagalign_peaks, + INPUT_CHECK.out.csv, + contrasts + ) } } diff --git a/modules/local/manorm/main.nf b/modules/local/manorm/main.nf new file mode 100644 index 00000000..45378dc9 --- /dev/null +++ b/modules/local/manorm/main.nf @@ -0,0 +1,35 @@ +process MANORM_PAIRWISE { + """ + adapted from: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/Rules/ChIPseq.snakefile#L709-L785 + """ + tag "${meta1.id}.${meta2.id}" + label 'process_single' + + container '' + + input: + tuple val(meta1), val(meta2), path(tagalign1), path(tagalign2), path(peak1), path(peak2) + + output: + path("manorm_output/*"), emit: dir + + script: + """ + manorm \\ + --p1 ${peak1} \\ + --p2 ${peak2} \\ + --r1 ${tagalign2} \\ + --r2 ${tagalign2} \\ + --s1 ${meta1.fraglen} \\ + --s2 ${meta2.fraglen} \\ + -o manorm_output/ \\ + --name1 ${meta1.group} \\ + --name2 ${meta2.group} + """ + + stub: + """ + mkdir manorm_output/ + touch manorm_output/blank.txt + """ +} diff --git a/subworkflows/local/diff/main.nf b/subworkflows/local/diffbind/main.nf similarity index 51% rename from subworkflows/local/diff/main.nf rename to subworkflows/local/diffbind/main.nf index 5a20e8d1..36c9cca9 100644 --- a/subworkflows/local/diff/main.nf +++ b/subworkflows/local/diffbind/main.nf @@ -1,33 +1,11 @@ -include { CHECK_CONTRASTS } from "../../../modules/local/check_contrasts/" include { PREP_DIFFBIND } from "../../../modules/local/diffbind/prep/" include { RMARKDOWNNOTEBOOK } from '../../../modules/nf-core/rmarkdownnotebook/' -workflow DIFF { +workflow DIFFBIND { take: - bam_peaks - samplesheet_file - contrasts_file - + ch_bam_peaks_contrasts main: - - CHECK_CONTRASTS(samplesheet_file, contrasts_file) - .csv - .flatten() - .splitCsv( header: true, sep: ',' ) - .map{ it -> - meta = get_contrast_meta(it) - [ meta.sample_basename, [group: meta.group, contrast: meta.contrast] ] - } - .unique() - .set{ contrasts } - bam_peaks - .combine( contrasts ) - .map{ sample_basename1, peak_meta, bam, bai, peak, ctrl_bam, ctrl_bai, sample_basename2, con_meta -> - sample_basename1 == sample_basename2 ? [ peak_meta + con_meta, bam, bai, peak, ctrl_bam, ctrl_bai ] : null - } - .unique() - .set{ ch_peaks_contrasts } - ch_peaks_contrasts | PREP_DIFFBIND + ch_bam_peaks_contrasts | PREP_DIFFBIND PREP_DIFFBIND.out.csv .collectFile(storeDir: "${params.outdir}/diffbind/contrasts", newLine: false, keepHeader: true, skip: 1) { meta, row -> @@ -47,7 +25,7 @@ workflow DIFF { } .set{ contrast_files } // [ meta, params, contrast ] - ch_peaks_contrasts + ch_bam_peaks_contrasts .map{ meta, bam, bai, peak, ctrl_bam, ctrl_bai -> [bam, bai, peak, ctrl_bam, ctrl_bai] } @@ -65,20 +43,7 @@ workflow DIFF { RMARKDOWNNOTEBOOK( ch_rmarkdown, ch_data_files ) emit: - diff_peaks = bam_peaks - -} - -def get_contrast_meta(LinkedHashMap row) { - def meta = [:] - meta.id = row.sample - meta.sample_basename = row.sample_basename - meta.rep = row.rep - meta.single_end = row.single_end.toBoolean() - meta.antibody = row.antibody - meta.control = row.control - meta.group = row.group - meta.contrast = row.contrast - - return meta + report = RMARKDOWNNOTEBOOK.out.report + artifacts = RMARKDOWNNOTEBOOK.out.artifacts + versions = RMARKDOWNNOTEBOOK.out.versions } diff --git a/subworkflows/local/differential/main.nf b/subworkflows/local/differential/main.nf new file mode 100644 index 00000000..11325b26 --- /dev/null +++ b/subworkflows/local/differential/main.nf @@ -0,0 +1,88 @@ +include { CHECK_CONTRASTS } from "../../../modules/local/check_contrasts/" +include { DIFFBIND } from "../../../subworkflows/local/diffbind/" +include { MANORM } from "../../../subworkflows/local/manorm/" + +workflow DIFF { + take: + bam_peaks + tagalign_peaks + samplesheet_file + contrasts_file + + main: + + CHECK_CONTRASTS(samplesheet_file, contrasts_file) + .csv + .flatten() + .splitCsv( header: true, sep: ',' ) + .map{ it -> + meta = get_contrast_meta(it) + [ sample_basename: meta.sample_basename, group: meta.group, contrast: meta.contrast ] + } + .unique() + .set{ contrasts } + + bam_peaks + .map{ meta, bam, bai, peak, ctrl_bam, ctrl_bai -> [meta.sample_basename, meta.rep] } + .groupTuple() + .map{ sample_basename, rep_list -> + rep_list.unique().size() + } + .min() // if any sample only has one replicate, do MAnorm, otherwise do diffbind + .set{ min_reps } + + // prepare bam channel for diffbind + bam_peaks + .combine( contrasts ) + .map{ peak_meta, bam, bai, peak, ctrl_bam, ctrl_bai, con_meta -> + peak_meta.sample_basename == con_meta.sample_basename ? [ peak_meta + con_meta, bam, bai, peak, ctrl_bam, ctrl_bai ] : null + } + .unique() + .combine(min_reps) + .branch{ meta, bam, bai, peak, ctrl_bam, ctrl_bai, min_reps -> + manorm: min_reps == 1 + return (null) // tagalign files used instead of bam for manorm + diffbind: min_reps >= 2 + return (tuple(meta, bam, bai, peak, ctrl_bam, ctrl_bai)) + } + .set{ bam_diff } + // run diffbind on all samples if every sample has >= 2 replicates + bam_diff.diffbind | DIFFBIND + + // prepare tagalign channel for manorm + tagalign_peaks + .combine(contrasts) + .map{ peak_meta, tagalign, peak, con_meta -> + peak_meta.sample_basename == con_meta.sample_basename ? [ peak_meta + con_meta, tagalign, peak ] : null + } + .unique() + .combine(min_reps) + .branch{ meta, tagalign, peak, min_reps -> + manorm: min_reps == 1 + return (tuple(meta, tagalign, peak)) + diffbind: min_reps >= 2 + return (null) + } + .set{ tagalign_diff } + // run manorm on all samples if any sample has only 1 replicate + tagalign_diff.manorm | MANORM + + emit: + diff_peaks = bam_peaks + // TODO + +} + +def get_contrast_meta(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.sample_basename = row.sample_basename + meta.rep = row.rep + meta.single_end = row.single_end.toBoolean() + meta.antibody = row.antibody + meta.control = row.control + meta.group = row.group + meta.contrast = row.contrast + + return meta +} diff --git a/subworkflows/local/manorm/main.nf b/subworkflows/local/manorm/main.nf new file mode 100644 index 00000000..333fe425 --- /dev/null +++ b/subworkflows/local/manorm/main.nf @@ -0,0 +1,17 @@ +include { MANORM_PAIRWISE } from "../../../modules/local/manorm/" + +workflow MANORM { + + take: + ch_tagalign_peaks + main: + ch_tagalign_peaks.view() + + ch_tagalign_peaks + .combine(ch_tagalign_peaks) + .map{ meta1, tag1, peak1, meta2, tag2, peak2 -> + (meta1.tool == meta2.tool && meta1.contrast == meta2.contrast && meta1.group != meta2.group) ? [ meta1, meta2, tag1, tag2, peak1, peak2 ] : null + } + | MANORM_PAIRWISE + +} diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf index 20ad040b..45317f27 100644 --- a/subworkflows/local/peaks.nf +++ b/subworkflows/local/peaks.nf @@ -96,6 +96,15 @@ workflow CALL_PEAKS { ch_peaks = ch_peaks.mix(FILTER_GEM.out.peak) } + // Create tag align w/ peaks + tag_all_bed.cross(ch_peaks) + .map{ it -> + it.flatten() + } + .map{ meta1, tagalign, meta2, peak, tool -> + meta1 == meta2 ? [ meta1, tagalign, peak, tool ] : null + } + .set{ ch_tagalign_peaks } // Create Channel with meta, deduped bam, peak file, peak-calling tool, and chrom sizes fasta deduped_bam.cross(ch_peaks) .map{ it -> @@ -105,6 +114,7 @@ workflow CALL_PEAKS { meta1 == meta2 ? [ meta1, bam, bai, peak, tool ] : null } .set{ ch_bam_peaks } + ch_bam_peaks.combine(chrom_sizes) | FRACTION_IN_PEAKS FRACTION_IN_PEAKS.out.collect() | CONCAT_FRIPS | PLOT_FRIP @@ -126,5 +136,6 @@ workflow CALL_PEAKS { emit: peaks = ch_peaks bam_peaks = ch_bam_peaks + tagalign_peaks = ch_tagalign_peaks plots = ch_plots } From 84538fcd35deedb0064ecaa4913758dc346c19d5 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 14 Dec 2023 16:49:24 -0500 Subject: [PATCH 29/35] chore: add diff subwf to changelog --- CHANGELOG.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02682cb3..fa9f01c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,20 @@ ## development version +### New features + - Print the recommended citation in bibtex format with `champagne --citation`. (#153) -- Support multiple replicates per sample and call consensus peaks. (#129) +- Support multiple replicates per sample and call consensus peaks on replicates. (#129) + - Optionally normalize p-values. - Find motifs in the genome with Homer. (#142) - Run motif enrichment analysis with MEME. (#142) - Annotate peaks with chipseeker. (#142,#147,#157) - Add preseq complexity curve and fastq screen to multiqc report. (#147) +- Implement differential peak calling. (#158) + - Optionally specify contrasts via a YAML file. + - If any sample has only one replicate, run `MAnorm`, otherwise run `diffbind`. + +### Bug fixes + - Fix deepTools plots (#144): - Per sample fingerprint plots instead of per replicate - Input normalized profile plots From e34c9f5afdf02b47258d69ae5b2a2fa442bc55a5 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 15 Dec 2023 14:47:05 -0500 Subject: [PATCH 30/35] fix: typo --- conf/test_mm10.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_mm10.config b/conf/test_mm10.config index 44160c46..47086230 100644 --- a/conf/test_mm10.config +++ b/conf/test_mm10.config @@ -5,7 +5,7 @@ params { genome = 'mm10' outdir = "results/test_mm10" input = "assets/samplesheet_test_mm10.csv" - contrats = 'assets/contrasts_test_mm10.yml' + contrasts = 'assets/contrasts_test_mm10.yml' read_length = 50 sicer.species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py From 7f1dc30657f9a37d5d39a9b24620b9fa7f0f0a28 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 15 Dec 2023 15:32:53 -0500 Subject: [PATCH 31/35] feat: add manorm container runs successfully on test_mm10 --- modules/local/manorm/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/manorm/main.nf b/modules/local/manorm/main.nf index 45378dc9..c7dbb694 100644 --- a/modules/local/manorm/main.nf +++ b/modules/local/manorm/main.nf @@ -5,7 +5,7 @@ process MANORM_PAIRWISE { tag "${meta1.id}.${meta2.id}" label 'process_single' - container '' + container 'nciccbr/ccbr_manorm:v1' input: tuple val(meta1), val(meta2), path(tagalign1), path(tagalign2), path(peak1), path(peak2) From 0597c2b04be7256b91dcbfbc105058583f79db49 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 15 Dec 2023 17:40:15 -0500 Subject: [PATCH 32/35] fix: increase resource allocs --- modules/local/diffbind/prep/main.nf | 2 +- modules/local/peaks.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/diffbind/prep/main.nf b/modules/local/diffbind/prep/main.nf index c86cbff2..c4d00793 100644 --- a/modules/local/diffbind/prep/main.nf +++ b/modules/local/diffbind/prep/main.nf @@ -1,6 +1,6 @@ process PREP_DIFFBIND { tag { "${meta.id}.${meta.contrast}.${meta.tool}" } - label 'process_single' + label 'process_low' container "${params.containers.base}" diff --git a/modules/local/peaks.nf b/modules/local/peaks.nf index 093157b4..7f129d97 100644 --- a/modules/local/peaks.nf +++ b/modules/local/peaks.nf @@ -378,7 +378,7 @@ process PLOT_FRIP { process JACCARD_INDEX { tag "${toolA} ${metaA.id} vs. ${toolB} ${metaB.id}" label 'peaks' - label 'process_single' + label 'process_low' container "${params.containers.base}" From 2feca7b5d8db3421284dba38d6e8f8ca1882d8c7 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 18 Dec 2023 14:55:32 -0500 Subject: [PATCH 33/35] fix: peakFormat is bed --- assets/diffbind_report.Rmd | 2 +- subworkflows/local/differential/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/diffbind_report.Rmd b/assets/diffbind_report.Rmd index b2283f19..926ddbb5 100644 --- a/assets/diffbind_report.Rmd +++ b/assets/diffbind_report.Rmd @@ -39,7 +39,7 @@ suppressMessages(library(DiffBind)) ## Read in sample sheet information and peak information ```{r samples, echo=FALSE, warning=FALSE,message=FALSE} -samples <- dba(sampleSheet = csvfile) +samples <- dba(sampleSheet = csvfile, peakFormat = 'bed') consensus <- dba.peakset(samples, consensus = DBA_CONDITION) print(samples) ``` diff --git a/subworkflows/local/differential/main.nf b/subworkflows/local/differential/main.nf index 11325b26..4115e08a 100644 --- a/subworkflows/local/differential/main.nf +++ b/subworkflows/local/differential/main.nf @@ -60,7 +60,7 @@ workflow DIFF { .branch{ meta, tagalign, peak, min_reps -> manorm: min_reps == 1 return (tuple(meta, tagalign, peak)) - diffbind: min_reps >= 2 + diffbind: min_reps >= 2 // bam files used instead of tagalign for diffbind return (null) } .set{ tagalign_diff } From 105430557e625ba6afa23f423b929cb7acfc1a85 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 18 Dec 2023 15:58:29 -0500 Subject: [PATCH 34/35] refactor: improve manorm outdir name --- modules/local/manorm/main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/manorm/main.nf b/modules/local/manorm/main.nf index c7dbb694..e562e623 100644 --- a/modules/local/manorm/main.nf +++ b/modules/local/manorm/main.nf @@ -11,7 +11,7 @@ process MANORM_PAIRWISE { tuple val(meta1), val(meta2), path(tagalign1), path(tagalign2), path(peak1), path(peak2) output: - path("manorm_output/*"), emit: dir + path("${meta1.id}_vs_${meta2.id}/*"), emit: dir script: """ @@ -22,14 +22,14 @@ process MANORM_PAIRWISE { --r2 ${tagalign2} \\ --s1 ${meta1.fraglen} \\ --s2 ${meta2.fraglen} \\ - -o manorm_output/ \\ + -o ${meta1.id}_vs_${meta2.id}/ \\ --name1 ${meta1.group} \\ --name2 ${meta2.group} """ stub: """ - mkdir manorm_output/ - touch manorm_output/blank.txt + mkdir ${meta1.id}_vs_${meta2.id}/ + touch ${meta1.id}_vs_${meta2.id}/blank.txt """ } From 049b00985e26f110645492cde8065002e250eeac Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 18 Dec 2023 15:59:50 -0500 Subject: [PATCH 35/35] refactor: improve diffbind rmarkdown notebook process name --- subworkflows/local/diffbind/main.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/diffbind/main.nf b/subworkflows/local/diffbind/main.nf index 36c9cca9..f50e53f0 100644 --- a/subworkflows/local/diffbind/main.nf +++ b/subworkflows/local/diffbind/main.nf @@ -1,5 +1,5 @@ include { PREP_DIFFBIND } from "../../../modules/local/diffbind/prep/" -include { RMARKDOWNNOTEBOOK } from '../../../modules/nf-core/rmarkdownnotebook/' +include { RMARKDOWNNOTEBOOK as DIFFBIND_RMD } from '../../../modules/nf-core/rmarkdownnotebook/' workflow DIFFBIND { take: @@ -40,10 +40,10 @@ workflow DIFFBIND { .combine(Channel.fromPath(file(params.diffbind.report, checkIfExists: true))) .set{ ch_rmarkdown } - RMARKDOWNNOTEBOOK( ch_rmarkdown, ch_data_files ) + DIFFBIND_RMD( ch_rmarkdown, ch_data_files ) emit: - report = RMARKDOWNNOTEBOOK.out.report - artifacts = RMARKDOWNNOTEBOOK.out.artifacts - versions = RMARKDOWNNOTEBOOK.out.versions + report = DIFFBIND_RMD.out.report + artifacts = DIFFBIND_RMD.out.artifacts + versions = DIFFBIND_RMD.out.versions }