CCBR · kelly-sovacool · Dec 18, 2023 · Nov 28, 2023 · Dec 4, 2023 · Dec 5, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,20 @@
 ## development version
 
+### New features
+
 - Print the recommended citation in bibtex format with `champagne --citation`. (#153)
-- Support multiple replicates per sample and call consensus peaks. (#129)
+- Support multiple replicates per sample and call consensus peaks on replicates. (#129)
+  - Optionally normalize p-values.
 - Find motifs in the genome with Homer. (#142)
 - Run motif enrichment analysis with MEME. (#142)
 - Annotate peaks with chipseeker. (#142,#147,#157)
 - Add preseq complexity curve and fastq screen to multiqc report. (#147)
+- Implement differential peak calling. (#158)
+  - Optionally specify contrasts via a YAML file.
+  - If any sample has only one replicate, run `MAnorm`, otherwise run `diffbind`.
+
+### Bug fixes
+
 - Fix deepTools plots (#144):
   - Per sample fingerprint plots instead of per replicate
   - Input normalized profile plots

diff --git a/assets/contrasts_full_mm10.yml b/assets/contrasts_full_mm10.yml
@@ -0,0 +1,12 @@
+antibody:
+    p20:
+      - CTCF_ChIP_macrophage_p20
+      - CTCF_ChIP_MEF_p20
+    p3:
+      - CTCF_ChIP_macrophage_p3
+celltype:
+    macrophage:
+      - CTCF_ChIP_macrophage_p20
+      - CTCF_ChIP_macrophage_p3
+    fibroblast:
+      - CTCF_ChIP_MEF_p20
diff --git a/assets/contrasts_test.yml b/assets/contrasts_test.yml
@@ -0,0 +1,10 @@
+timepoint:
+  T0:
+    - SPT5_T0
+  T15:
+    - SPT5_T15
+condition:
+  control:
+    - SPT5_T0
+  treatment:
+    - SPT5_T15
diff --git a/assets/contrasts_test_mm10.yml b/assets/contrasts_test_mm10.yml
@@ -0,0 +1,5 @@
+celltype:
+  macrophage:
+    - CTCF_ChIP_macrophage_p20
+  fibroblast:
+    - CTCF_ChIP_MEF_p20
diff --git a/assets/diffbind_report.Rmd b/assets/diffbind_report.Rmd
@@ -0,0 +1,169 @@
+---
+title: "DiffBind: CCBR ChIP-seq pipeline"
+output:
+    html_document:
+        toc: true
+        toc_depth: 2
+params:
+    csvfile: samplesheet.csv
+    contrast: "group1_vs_group2"
+    tool: "macs"
+---
+
+<!--
+source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/Results-template/Scripts/DiffBind_pipeliner.Rmd
+-->
+
+```{r, include=FALSE, warning=FALSE, message=FALSE}
+dateandtime <- format(Sys.time(), "%a %b %d %Y - %X")
+
+csvfile <- params$csvfile
+contrasts <- params$contrast
+peakcaller <- params$tool
+```
+
+### **Groups being compared:**
+####    *`r contrasts`*
+### **Peak sources:**
+####    *`r peakcaller`*
+### **Report generated:**
+####    *`r dateandtime`*
+
+```{r setup, echo=FALSE, warning=FALSE,message=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+suppressMessages(library(DT))
+suppressMessages(library(DiffBind))
+```
+
+<br/>
+
+## Read in sample sheet information and peak information
+```{r samples, echo=FALSE, warning=FALSE,message=FALSE}
+samples <- dba(sampleSheet = csvfile, peakFormat = 'bed')
+consensus <- dba.peakset(samples, consensus = DBA_CONDITION)
+print(samples)
+```
+
+<br/>
+
+## Plot raw information about the peaks
+### Correlation heatmap: Only peaks
+```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
+try(plot(samples, main = ""), silent = TRUE)
+```
+
+### PCA: Only peaks
+```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"}
+try(dba.plotPCA(samples, DBA_CONDITION), silent = TRUE)
+```
+
+### Overlapping peak counts
+```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5}
+if (nrow(samples$samples) < 5) {
+  dba.plotVenn(samples, 1:nrow(samples$samples))
+} else {
+  dba.plotVenn(samples, samples$masks[[3]])
+  dba.plotVenn(samples, samples$masks[[4]])
+  dba.plotVenn(consensus, consensus$masks$Consensus)
+}
+```
+
+```{r peaksORsummits, echo=F}
+if (grepl("narrow", samples$samples$Peaks[1])) {
+  summits <- TRUE
+  print("Narrow peak calling tool.")
+  print("Differential peaks are 250bp upstream and downstream of the summits.")
+} else if (grepl("broad", samples$samples$Peaks[1])) {
+  summits <- FALSE
+  print("Broad peak calling tool.")
+  print("Differential peaks are consensus peaks.")
+} else {
+  summits <- FALSE
+  print("Indeterminate peak calling tool.")
+  print("Differential peaks are consensus peaks.")
+}
+```
+
+## Read in bam file information under all peaks found in at least two samples
+```{r DBcount, echo=FALSE, warning=FALSE,message=FALSE}
+if (summits == TRUE) {
+  DBdataCounts <- dba.count(samples, summits = 250)
+} else {
+  DBdataCounts <- dba.count(samples)
+}
+print(DBdataCounts)
+```
+
+<br/>
+
+## Plot raw information about all analyzed peaks
+### Correlation heatmap: Peaks and reads
+```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
+try(plot(DBdataCounts, main = ""), silent = TRUE)
+```
+
+### Heatmap: Average signal across each peak
+```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
+try(dba.plotHeatmap(DBdataCounts, correlations = FALSE), silent = TRUE)
+```
+
+### PCA: Peaks and reads
+```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
+try(dba.plotPCA(DBdataCounts, DBA_CONDITION), silent = TRUE)
+```
+
+## Associate individual samples with the different contrasts
+```{r contrast, echo=FALSE, warning=FALSE,message=FALSE}
+DBdatacontrast <- dba.contrast(DBdataCounts, minMembers = 2, categories = DBA_CONDITION)
+print(DBdatacontrast)
+```
+
+<br/>
+
+## Call differential peaks using Deseq2
+```{r analyze, echo=FALSE, warning=FALSE,message=FALSE}
+DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2)
+```
+
+```{r, echo=FALSE, warning=FALSE,message=FALSE}
+DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2)
+```
+
+### PCA
+```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
+try(dba.plotPCA(DBAnalysisDeseq2, contrast = 1, method = DBA_DESEQ2), silent = TRUE)
+```
+
+
+### MANorm
+```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
+
+try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE)
+```
+
+### Volcano plot
+```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
+try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE)
+```
+
+### Boxplots
+```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
+par(mfcol = c(1, 2))
+if (length(DBReportDeseq2) > 0) {
+  try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2), silent = TRUE)
+} else {
+  plot(0, type = "n", axes = FALSE, ann = FALSE)
+}
+```
+
+## Differentially bound peaks
+```{r Deseq2Report, echo=FALSE, warning=FALSE,message=FALSE}
+outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt")
+write.table(DBReportDeseq2, outfile, quote = F, sep = "\t", row.names = F)
+DT::datatable(data.frame(DBReportDeseq2), rownames = F)
+```
+
+## R tool version information
+```{r Info, echo=FALSE, message=FALSE, warning=FALSE}
+sessionInfo()
+```
diff --git a/bin/plot_peak_widths.R b/bin/plot_peak_widths.R
@@ -13,8 +13,14 @@ tsv_filename <- args[1]
 peak_dat <- read_tsv(tsv_filename) %>%
   mutate(
     peak_width = chromEnd - chromStart,
-  ) %>%
-  filter(tool != "gem") # exclude gem because width is always 200
+  )
+tools <- peak_dat %>%
+  pull(tool) %>%
+  unique()
+if (length(tools) > 1) {
+  peak_dat <- peak_dat %>%
+    filter(tool != "gem") # exclude gem because width is always 200
+}
 
 sample_ids <- peak_dat %>%
   pull(sample_id) %>%

diff --git a/conf/full_mm10.config b/conf/full_mm10.config
@@ -5,6 +5,7 @@ params {
     genome = 'mm10'
     outdir = "results/full_mm10"
     input = "assets/samplesheet_full_mm10.csv"
+    contrasts = 'assets/contrasts_full_mm10.yml'
     sicer {
         species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
     }

diff --git a/conf/test.config b/conf/test.config
@@ -4,6 +4,7 @@ params {
 
     outdir = 'results/test'
     input = 'assets/samplesheet_test.csv' // adapted from 'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.0/samplesheet_test.csv'
+    contrasts = 'assets/contrasts_test.yml'
 
     genome = 'sacCer3'
     read_length = 50

diff --git a/conf/test_mm10.config b/conf/test_mm10.config
@@ -5,6 +5,7 @@ params {
     genome = 'mm10'
     outdir = "results/test_mm10"
     input = "assets/samplesheet_test_mm10.csv"
+    contrasts = 'assets/contrasts_test_mm10.yml'
     read_length = 50
     sicer.species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
 

diff --git a/main.nf b/main.nf
@@ -18,7 +18,6 @@ log.info """\
          .stripIndent()
 
 // SUBWORKFLOWS
-
 include { INPUT_CHECK              } from './subworkflows/local/input_check.nf'
 include { PREPARE_GENOME           } from './subworkflows/local/prepare_genome.nf'
 include { FILTER_BLACKLIST         } from './subworkflows/CCBR/filter_blacklist/'
@@ -28,13 +27,13 @@ include { QC                       } from './subworkflows/local/qc.nf'
 include { CALL_PEAKS               } from './subworkflows/local/peaks.nf'
 include { CONSENSUS_PEAKS          } from './subworkflows/CCBR/consensus_peaks/'
 include { ANNOTATE                 } from './subworkflows/local/annotate.nf'
+include { DIFF                     } from './subworkflows/local/differential/'
 
 // MODULES
 include { CUTADAPT                 } from "./modules/CCBR/cutadapt"
-include { PHANTOM_PEAKS            } from "./modules/local/qc.nf"
-include { PPQT_PROCESS
+include { PHANTOM_PEAKS
+          PPQT_PROCESS
           MULTIQC                  } from "./modules/local/qc.nf"
-include { NORMALIZE_INPUT          } from "./modules/local/deeptools.nf"
 
 workflow.onComplete {
     if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) {
@@ -55,7 +54,8 @@ workflow {
 }
 
 workflow CHIPSEQ {
-    INPUT_CHECK(file(params.input), params.seq_center)
+    INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center)
+
     INPUT_CHECK.out.reads.set { raw_fastqs }
     raw_fastqs | CUTADAPT
     CUTADAPT.out.reads.set{ trimmed_fastqs }
@@ -74,7 +74,7 @@ workflow CHIPSEQ {
 
     deduped_bam | PHANTOM_PEAKS
     PHANTOM_PEAKS.out.fraglen | PPQT_PROCESS
-    PPQT_PROCESS.out.fraglen.set {frag_lengths }
+    PPQT_PROCESS.out.fraglen.set { frag_lengths }
 
     ch_multiqc = Channel.of()
     if (params.run.qc) {
@@ -110,6 +110,36 @@ workflow CHIPSEQ {
                  PREPARE_GENOME.out.bioc_annot)
         ch_multiqc = ch_multiqc.mix(CALL_PEAKS.out.plots, ANNOTATE.out.plots)
 
+        // retrieve sample basename and peak-calling tool from metadata
+        CONSENSUS_PEAKS.out.peaks
+            .map{ meta, bed ->
+                meta_split = meta.id.tokenize('.')
+                assert meta_split.size() == 2
+                [ [ sample_basename: meta_split[0], tool: meta_split[1] ], bed ]
+            }
+            .set{ ch_consensus_peaks }
+        if (params.contrasts) {
+            contrasts = file(params.contrasts, checkIfExists: true)
+            // TODO use consensus peaks for regions of interest in diffbind
+            CALL_PEAKS.out.bam_peaks
+                .combine(deduped_bam)
+                .map{meta1, bam1, bai1, peak, tool, meta2, bam2, bai2 ->
+                    meta1.control == meta2.id ? [ meta1 + [tool: tool], bam1, bai1, peak, bam2, bai2 ] : null
+                }
+                .set{bam_peaks}
+            CALL_PEAKS.out.tagalign_peaks
+                .join(frag_lengths)
+                .map{ meta, tagalign, peak, tool, frag_len ->
+                    [ meta + [tool: tool, fraglen: frag_len], tagalign, peak ]
+                }
+                .set{ tagalign_peaks }
+            DIFF( bam_peaks,
+                  tagalign_peaks,
+                  INPUT_CHECK.out.csv,
+                  contrasts
+                )
+
+        }
     }
 
     MULTIQC(

diff --git a/modules.json b/modules.json
@@ -99,6 +99,12 @@
                         "branch": "master",
                         "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
                         "installed_by": ["modules"]
+                    },
+                    "rmarkdownnotebook": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/rmarkdownnotebook/rmarkdownnotebook.diff"
                     }
                 }
             }

diff --git a/modules/local/check_contrasts/main.nf b/modules/local/check_contrasts/main.nf
@@ -0,0 +1,18 @@
+process CHECK_CONTRASTS {
+    tag { contrasts }
+    label 'process_single'
+
+    container 'nciccbr/consensus_peaks:v1.1'
+
+    input:
+        path(samplesheet)
+        path(contrasts)
+
+    output:
+        path("*.csv"),        emit: csv
+        path("versions.yml"), emit: versions
+
+    script:
+    output_basename = "sample_contrasts"
+    template 'check_contrasts.R'
+}