From b00d9dfef9a855f3dd22f5f432b3ea1995c49c86 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Thu, 16 Nov 2023 10:03:37 -0500
Subject: [PATCH] feat: add chipseeker plots to annotation step

separate process for plotAnnoBar from list of annotations
---
 bin/annotate_peaks.R                      | 110 +++++++++++++---------
 bin/chipseeker_plotlist.R                 |  18 ++++
 modules/local/chipseeker/annotate/main.nf |  16 ++--
 modules/local/chipseeker/plotlist/main.nf |  25 +++++
 subworkflows/local/peaks.nf               |  16 +++-
 5 files changed, 128 insertions(+), 57 deletions(-)
 create mode 100644 bin/chipseeker_plotlist.R
 create mode 100644 modules/local/chipseeker/plotlist/main.nf

diff --git a/bin/annotate_peaks.R b/bin/annotate_peaks.R
index 08ada7dc..96797f2b 100755
--- a/bin/annotate_peaks.R
+++ b/bin/annotate_peaks.R
@@ -1,50 +1,47 @@
 #!/usr/bin/env Rscript
 # adapted from: https://github.com/CCBR/ASPEN/blob/55f909d76500c3502c1c397ef3000908649b0284/workflow/scripts/ccbr_annotate_peaks.R
 load_package <- function(x) {
-    suppressPackageStartupMessages(library(x, character.only = TRUE))
+  suppressPackageStartupMessages(library(x, character.only = TRUE))
+  return()
 }
-lapply(c('dplyr', 'ChIPseeker'), load_package)
+messages <- lapply(c("ChIPseeker", "dplyr", "glue", "ggplot2"), load_package)
 
 parser <- argparse::ArgumentParser()
-parser$add_argument("-n", "--narrowpeak",
-  required = TRUE,
-  dest = "narrowpeak", help = "narrowpeak file"
-)
-parser$add_argument("-a", "--annotated",
-  required = TRUE, dest = "annotated",
-  help = "annotated output file"
-)
+parser$add_argument("-p", "--peak", required = TRUE, help = "peak file")
 parser$add_argument("-u", "--uptss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS")
 parser$add_argument("-d", "--downtss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS")
 parser$add_argument("-t", "--toppromoterpeaks", required = FALSE, type = "integer", default = 1000, help = "filter top N peaks in promoters for genelist output")
-parser$add_argument("-l", "--genelist", required = TRUE, help = "list of genes with peaks in promoter regions")
-parser$add_argument("-f", "--atypefreq", required = TRUE, help = "frequency of different annotation types")
-parser$add_argument("-g", "--genome",
-  required = TRUE, type = "character", dest = "genome",
-  help = "hg38/hg19/mm10/mm9/mmul10/bosTau9"
-)
+parser$add_argument("-o", "--outfile-prefix", required = TRUE, type = "character", dest = "outfile_prefix", help = "prefix for output filenames")
+parser$add_argument("-g", "--genome", required = TRUE, help = "hg38/hg19/mm10/mm9/mmul10/bosTau9")
 
 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults,
 args <- parser$parse_args()
+outfile_prefix <- args$outfile_prefix
 
 genomes <- tibble::tribble(
-  ~ref_genome,            ~adb,                                  ~tdb,
-        "mm9",  "org.Mm.eg.db",   "TxDb.Mmusculus.UCSC.mm9.knownGene",
-       "mm10",  "org.Mm.eg.db",  "TxDb.Mmusculus.UCSC.mm10.knownGene",
-       "hg19",  "org.Hs.eg.db",   "TxDb.Hsapiens.UCSC.hg19.knownGene",
-       "hg38",  "org.Hs.eg.db",   "TxDb.Hsapiens.UCSC.hg38.knownGene",
-     "mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene",
-    "bosTau9",  "org.Bt.eg.db",   "TxDb.Btaurus.UCSC.bosTau9.refGene"
+  ~ref_genome, ~adb, ~tdb,
+  "mm9", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene",
+  "mm10", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene",
+  "hg19", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene",
+  "hg38", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene",
+  "mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene",
+  "bosTau9", "org.Bt.eg.db", "TxDb.Btaurus.UCSC.bosTau9.refGene"
 )
-adb <- genomes %>% filter(ref_genome == args$genome) %>% pull(adb)
+adb <- genomes %>%
+  filter(ref_genome == args$genome) %>%
+  pull(adb)
 load_package(adb)
-tdb_str <- genomes %>% filter(ref_genome == args$genome) %>% pull(tdb)
+tdb_str <- genomes %>%
+  filter(ref_genome == args$genome) %>%
+  pull(tdb)
 load_package(tdb_str)
-tdb <- tdb_str %>% rlang::sym() %>% eval()
+tdb <- tdb_str %>%
+  rlang::sym() %>%
+  eval()
 
-np <- read.table(args$narrowpeak, sep = "\t")
-peak_colnames  <- c(
+np <- read.table(args$peak, sep = "\t")
+peak_colnames <- c(
   "chrom",
   "chromStart",
   "chromEnd",
@@ -55,14 +52,14 @@ peak_colnames  <- c(
   "pValue",
   "qValue"
 )
-num_columns  <- ncol(np)
+num_columns <- ncol(np)
 if (num_columns == 9) {
-    colnames(np) <- peak_colnames
-    np <- np %>% mutate(peak = NA)
+  colnames(np) <- peak_colnames
+  np <- np %>% mutate(peak = NA)
 } else if (num_columns == 10) {
-    colnames(np) <- c(peak_colnames, "peak")
+  colnames(np) <- c(peak_colnames, "peak")
 } else {
-    stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given"))
+  stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given"))
 }
 np <- np[order(-np$qValue), ]
 np$peakID <- paste(np$chrom, ":", np$chromStart, "-", np$chromEnd, sep = "")
@@ -83,6 +80,7 @@ pa <- annotatePeak(
   ignoreDownstream = FALSE,
   overlap = "TSS"
 )
+saveRDS(pa, file = glue("{outfile_prefix}.annotation.Rds"))
 
 padf <- as.data.frame(pa)
 padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "")
@@ -143,13 +141,14 @@ colnames(merged) <- c(
 
 # merge annotation with narrowPeak file
 merged <- merged[order(-merged$qValue), ]
-write.table(merged, args$annotated, sep = "\t", quote = FALSE, row.names = FALSE)
+annotated_outfile <- glue("{outfile_prefix}.annotated.txt")
+write.table(merged, annotated_outfile, sep = "\t", quote = FALSE, row.names = FALSE)
 l <- paste("# Median peak width : ", median(merged$width), sep = "")
-write(l, args$annotated, append = TRUE)
+write(l, annotated_outfile, append = TRUE)
 l <- paste("# Median pValue : ", median(merged$pValue), sep = "")
-write(l, args$annotated, append = TRUE)
+write(l, annotated_outfile, append = TRUE)
 l <- paste("# Median qValue : ", median(merged$qValue), sep = "")
-write(l, args$annotated, append = TRUE)
+write(l, annotated_outfile, append = TRUE)
 
 
 # get promoter genes
@@ -162,18 +161,20 @@ promoters <- promoters[order(-promoters$qValue), ]
 promoters <- head(promoters, n = args$toppromoterpeaks)
 promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")])
 colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL")
-write.table(promoter_genes, args$genelist, sep = "\t", quote = FALSE, row.names = FALSE)
+outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
+write.table(promoter_genes, outfile_genelist, sep = "\t", quote = FALSE, row.names = FALSE)
 l <- paste("# Median peak width : ", median(promoters$width), sep = "")
-write(l, args$genelist, append = TRUE)
+write(l, outfile_genelist, append = TRUE)
 l <- paste("# Median pValue : ", median(promoters$pValue), sep = "")
-write(l, args$genelist, append = TRUE)
+write(l, outfile_genelist, append = TRUE)
 l <- paste("# Median qValue : ", median(promoters$qValue), sep = "")
-write(l, args$genelist, append = TRUE)
+write(l, outfile_genelist, append = TRUE)
 
 # annotation type frequency table
 
 l <- paste("#annotationType", "frequency", "medianWidth", "medianpValue", "medianqValue", sep = "\t")
-write(l, args$atypefreq)
+outfile_summary <- glue("{outfile_prefix}.summary.txt")
+write(l, outfile_summary)
 atypes <- c(
   "3' UTR",
   "5' UTR",
@@ -190,7 +191,7 @@ for (ann in atypes) {
   p <- median(x$pValue)
   q <- median(x$qValue)
   l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t")
-  write(l, args$atypefreq, append = TRUE)
+  write(l, outfile_summary, append = TRUE)
 }
 for (ann in c("Exon", "Intron")) {
   x <- dplyr::filter(merged, grepl(ann, annotation))
@@ -198,5 +199,26 @@ for (ann in c("Exon", "Intron")) {
   p <- median(x$pValue)
   q <- median(x$qValue)
   l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t")
-  write(l, args$atypefreq, append = TRUE)
+  write(l, outfile_summary, append = TRUE)
 }
+
+# plots for individual peak file
+peak <- readPeakFile(args$peak)
+plots <- list(
+  covplot = covplot(peak, weightCol = "V5"),
+  plotPeakProf2 = plotPeakProf2(
+    peak = peak, upstream = rel(0.2), downstream = rel(0.2),
+    conf = 0.95, by = "gene", type = "body", nbin = 800,
+    TxDb = txdb, weightCol = "V5", ignore_strand = F
+  ),
+  upsetplot = upsetplot(annot, vennpie = TRUE)
+)
+
+names(plots) %>%
+  lapply(function(plot_name) {
+    ggsave(
+      filename = glue("{outfile_prefix}_{plot_name}.png"),
+      plot = plots[[plot_name]],
+      device = "png"
+    )
+  })
diff --git a/bin/chipseeker_plotlist.R b/bin/chipseeker_plotlist.R
new file mode 100644
index 00000000..8ffe76fc
--- /dev/null
+++ b/bin/chipseeker_plotlist.R
@@ -0,0 +1,18 @@
+#!/usr/bin/env Rscript
+# chipseeker plots that accept list of annotation objects
+load_package <- function(x) {
+  suppressPackageStartupMessages(library(x, character.only = TRUE))
+}
+messages <- lapply(c("ChIPseeker", "dplyr", "ggplot2", "glue"), load_package)
+
+parser <- argparse::ArgumentParser()
+parser$add_argument("-a", "--annotations", required = TRUE, type = "character", dest = "annots", help = "space-delimited list of Rds files of peak annotation objects from chipseeker")
+parser$add_argument("-o", "--outfile", required = TRUE, type = "character", help = "output filenames")
+
+args <- parser$parse_args()
+
+peak_anno_list <- args$annotations %>%
+  str_split(" ") %>%
+  lapply(readRds)
+anno_bar_plot <- plotAnnoBar(peak_anno_list)
+ggsave(filename = args$outfile, plot = anno_bar_plot)
diff --git a/modules/local/chipseeker/annotate/main.nf b/modules/local/chipseeker/annotate/main.nf
index f70c7e94..7cb2099f 100644
--- a/modules/local/chipseeker/annotate/main.nf
+++ b/modules/local/chipseeker/annotate/main.nf
@@ -3,21 +3,21 @@ process CHIPSEEKER_ANNOTATE {
     label 'peaks'
     label 'process_medium'
 
-    container 'nciccbr/ccbr_atacseq:v1.10'
+    container 'nciccbr/ccbr_chipseeker:v1.0.1'
 
     input:
         tuple val(meta), path(bed)
 
     output:
-        tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: annot
+        tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt
+        path("*.annotation.Rds"),                                                                emit: annot
+        path("*.png"),                                                                           emit: plots
 
     script:
     """
     annotate_peaks.R \\
-        --narrowpeak ${bed} \\
-        --annotated ${meta.id}.${meta.group}.annotated.txt \\
-        --atypefreq ${meta.id}.${meta.group}.summary.txt \\
-        --genelist ${meta.id}.${meta.group}.genelist.txt \\
+        --peak ${bed} \\
+        --outfile-prefix ${meta.id}.${meta.group} \\
         --genome ${params.genome} \\
         --uptss 2000 \\
         --downtss 2000 \\
@@ -26,9 +26,9 @@ process CHIPSEEKER_ANNOTATE {
 
     stub:
     """
-    for ftype in annotated summary genelist
+    for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
     do
-        touch ${meta.id}.${meta.group}.\${ftype}.txt
+        touch ${meta.id}.${meta.group}.\${ftype}
     done
     """
 }
diff --git a/modules/local/chipseeker/plotlist/main.nf b/modules/local/chipseeker/plotlist/main.nf
new file mode 100644
index 00000000..f176817d
--- /dev/null
+++ b/modules/local/chipseeker/plotlist/main.nf
@@ -0,0 +1,25 @@
+process CHIPSEEKER_PLOTLIST {
+    tag "${meta.id}.${meta.group}"
+    label 'peaks'
+    label 'process_medium'
+
+    container 'nciccbr/ccbr_chipseeker:v1.0.1'
+
+    input:
+        path(rds)
+
+    output:
+        path("*.png"), emit: plots
+
+    script:
+    """
+    chipseeker_plotlist.R \\
+        --annotations ${rds.join(' ')} \\
+        --outfile plot_anno_bar.png
+    """
+
+    stub:
+    """
+    touch plot_anno_bar.png
+    """
+}
diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf
index 31e5941c..545172c9 100644
--- a/subworkflows/local/peaks.nf
+++ b/subworkflows/local/peaks.nf
@@ -20,6 +20,7 @@ include { CONSENSUS_PEAKS     } from "../../modules/local/consensus_peaks"
 include { HOMER_MOTIFS        } from "../../modules/local/homer"
 include { MEME_AME            } from "../../modules/local/meme"
 include { CHIPSEEKER_ANNOTATE } from "../../modules/local/chipseeker/annotate"
+include { CHIPSEEKER_PLOTLIST } from "../../modules/local/chipseeker/plotlist"
 
 workflow CALL_PEAKS {
     take:
@@ -151,17 +152,22 @@ workflow CALL_PEAKS {
 
         if (params.run.chipseeker) {
             CONSENSUS_PEAKS.out.peaks | CHIPSEEKER_ANNOTATE
+            CHIPSEEKER_ANNOTATE.out.annot.collect() | CHIPSEEKER_PLOTLIST
+            ch_plots = ch_plots.mix(
+                CHIPSEEKER_ANNOTATE.out.plots,
+                CHIPSEEKER_PLOTLIST.out.plots
+                )
         }
         if (params.run.homer) {
             HOMER_MOTIFS( CONSENSUS_PEAKS.out.peaks.combine(genome_fasta),
                         params.homer.de_novo,
                         file(params.homer.jaspar_db, checkIfExists: true)
                         )
-        }
-        if (params.genomes[ params.genome ].meme_motifs) {
-            MEME_AME( HOMER_MOTIFS.out.ame,
-                      file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true)
-                    )
+            if (params.genomes[ params.genome ].meme_motifs) {
+                MEME_AME( HOMER_MOTIFS.out.ame,
+                        file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true)
+                        )
+            }
         }
 
     emit: