From b00d9dfef9a855f3dd22f5f432b3ea1995c49c86 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 16 Nov 2023 10:03:37 -0500 Subject: [PATCH] feat: add chipseeker plots to annotation step separate process for plotAnnoBar from list of annotations --- bin/annotate_peaks.R | 110 +++++++++++++--------- bin/chipseeker_plotlist.R | 18 ++++ modules/local/chipseeker/annotate/main.nf | 16 ++-- modules/local/chipseeker/plotlist/main.nf | 25 +++++ subworkflows/local/peaks.nf | 16 +++- 5 files changed, 128 insertions(+), 57 deletions(-) create mode 100644 bin/chipseeker_plotlist.R create mode 100644 modules/local/chipseeker/plotlist/main.nf diff --git a/bin/annotate_peaks.R b/bin/annotate_peaks.R index 08ada7dc..96797f2b 100755 --- a/bin/annotate_peaks.R +++ b/bin/annotate_peaks.R @@ -1,50 +1,47 @@ #!/usr/bin/env Rscript # adapted from: https://github.com/CCBR/ASPEN/blob/55f909d76500c3502c1c397ef3000908649b0284/workflow/scripts/ccbr_annotate_peaks.R load_package <- function(x) { - suppressPackageStartupMessages(library(x, character.only = TRUE)) + suppressPackageStartupMessages(library(x, character.only = TRUE)) + return() } -lapply(c('dplyr', 'ChIPseeker'), load_package) +messages <- lapply(c("ChIPseeker", "dplyr", "glue", "ggplot2"), load_package) parser <- argparse::ArgumentParser() -parser$add_argument("-n", "--narrowpeak", - required = TRUE, - dest = "narrowpeak", help = "narrowpeak file" -) -parser$add_argument("-a", "--annotated", - required = TRUE, dest = "annotated", - help = "annotated output file" -) +parser$add_argument("-p", "--peak", required = TRUE, help = "peak file") parser$add_argument("-u", "--uptss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS") parser$add_argument("-d", "--downtss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS") parser$add_argument("-t", "--toppromoterpeaks", required = FALSE, type = "integer", default = 1000, help = "filter top N peaks in promoters for genelist output") -parser$add_argument("-l", "--genelist", required = TRUE, help = "list of genes with peaks in promoter regions") -parser$add_argument("-f", "--atypefreq", required = TRUE, help = "frequency of different annotation types") -parser$add_argument("-g", "--genome", - required = TRUE, type = "character", dest = "genome", - help = "hg38/hg19/mm10/mm9/mmul10/bosTau9" -) +parser$add_argument("-o", "--outfile-prefix", required = TRUE, type = "character", dest = "outfile_prefix", help = "prefix for output filenames") +parser$add_argument("-g", "--genome", required = TRUE, help = "hg38/hg19/mm10/mm9/mmul10/bosTau9") # get command line options, if help option encountered print help and exit, # otherwise if options not found on command line then set defaults, args <- parser$parse_args() +outfile_prefix <- args$outfile_prefix genomes <- tibble::tribble( - ~ref_genome, ~adb, ~tdb, - "mm9", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene", - "mm10", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene", - "hg19", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene", - "hg38", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene", - "mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene", - "bosTau9", "org.Bt.eg.db", "TxDb.Btaurus.UCSC.bosTau9.refGene" + ~ref_genome, ~adb, ~tdb, + "mm9", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene", + "mm10", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene", + "hg19", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene", + "hg38", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene", + "mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene", + "bosTau9", "org.Bt.eg.db", "TxDb.Btaurus.UCSC.bosTau9.refGene" ) -adb <- genomes %>% filter(ref_genome == args$genome) %>% pull(adb) +adb <- genomes %>% + filter(ref_genome == args$genome) %>% + pull(adb) load_package(adb) -tdb_str <- genomes %>% filter(ref_genome == args$genome) %>% pull(tdb) +tdb_str <- genomes %>% + filter(ref_genome == args$genome) %>% + pull(tdb) load_package(tdb_str) -tdb <- tdb_str %>% rlang::sym() %>% eval() +tdb <- tdb_str %>% + rlang::sym() %>% + eval() -np <- read.table(args$narrowpeak, sep = "\t") -peak_colnames <- c( +np <- read.table(args$peak, sep = "\t") +peak_colnames <- c( "chrom", "chromStart", "chromEnd", @@ -55,14 +52,14 @@ peak_colnames <- c( "pValue", "qValue" ) -num_columns <- ncol(np) +num_columns <- ncol(np) if (num_columns == 9) { - colnames(np) <- peak_colnames - np <- np %>% mutate(peak = NA) + colnames(np) <- peak_colnames + np <- np %>% mutate(peak = NA) } else if (num_columns == 10) { - colnames(np) <- c(peak_colnames, "peak") + colnames(np) <- c(peak_colnames, "peak") } else { - stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given")) + stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given")) } np <- np[order(-np$qValue), ] np$peakID <- paste(np$chrom, ":", np$chromStart, "-", np$chromEnd, sep = "") @@ -83,6 +80,7 @@ pa <- annotatePeak( ignoreDownstream = FALSE, overlap = "TSS" ) +saveRDS(pa, file = glue("{outfile_prefix}.annotation.Rds")) padf <- as.data.frame(pa) padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "") @@ -143,13 +141,14 @@ colnames(merged) <- c( # merge annotation with narrowPeak file merged <- merged[order(-merged$qValue), ] -write.table(merged, args$annotated, sep = "\t", quote = FALSE, row.names = FALSE) +annotated_outfile <- glue("{outfile_prefix}.annotated.txt") +write.table(merged, annotated_outfile, sep = "\t", quote = FALSE, row.names = FALSE) l <- paste("# Median peak width : ", median(merged$width), sep = "") -write(l, args$annotated, append = TRUE) +write(l, annotated_outfile, append = TRUE) l <- paste("# Median pValue : ", median(merged$pValue), sep = "") -write(l, args$annotated, append = TRUE) +write(l, annotated_outfile, append = TRUE) l <- paste("# Median qValue : ", median(merged$qValue), sep = "") -write(l, args$annotated, append = TRUE) +write(l, annotated_outfile, append = TRUE) # get promoter genes @@ -162,18 +161,20 @@ promoters <- promoters[order(-promoters$qValue), ] promoters <- head(promoters, n = args$toppromoterpeaks) promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")]) colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL") -write.table(promoter_genes, args$genelist, sep = "\t", quote = FALSE, row.names = FALSE) +outfile_genelist <- glue("{outfile_prefix}.genelist.txt") +write.table(promoter_genes, outfile_genelist, sep = "\t", quote = FALSE, row.names = FALSE) l <- paste("# Median peak width : ", median(promoters$width), sep = "") -write(l, args$genelist, append = TRUE) +write(l, outfile_genelist, append = TRUE) l <- paste("# Median pValue : ", median(promoters$pValue), sep = "") -write(l, args$genelist, append = TRUE) +write(l, outfile_genelist, append = TRUE) l <- paste("# Median qValue : ", median(promoters$qValue), sep = "") -write(l, args$genelist, append = TRUE) +write(l, outfile_genelist, append = TRUE) # annotation type frequency table l <- paste("#annotationType", "frequency", "medianWidth", "medianpValue", "medianqValue", sep = "\t") -write(l, args$atypefreq) +outfile_summary <- glue("{outfile_prefix}.summary.txt") +write(l, outfile_summary) atypes <- c( "3' UTR", "5' UTR", @@ -190,7 +191,7 @@ for (ann in atypes) { p <- median(x$pValue) q <- median(x$qValue) l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t") - write(l, args$atypefreq, append = TRUE) + write(l, outfile_summary, append = TRUE) } for (ann in c("Exon", "Intron")) { x <- dplyr::filter(merged, grepl(ann, annotation)) @@ -198,5 +199,26 @@ for (ann in c("Exon", "Intron")) { p <- median(x$pValue) q <- median(x$qValue) l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t") - write(l, args$atypefreq, append = TRUE) + write(l, outfile_summary, append = TRUE) } + +# plots for individual peak file +peak <- readPeakFile(args$peak) +plots <- list( + covplot = covplot(peak, weightCol = "V5"), + plotPeakProf2 = plotPeakProf2( + peak = peak, upstream = rel(0.2), downstream = rel(0.2), + conf = 0.95, by = "gene", type = "body", nbin = 800, + TxDb = txdb, weightCol = "V5", ignore_strand = F + ), + upsetplot = upsetplot(annot, vennpie = TRUE) +) + +names(plots) %>% + lapply(function(plot_name) { + ggsave( + filename = glue("{outfile_prefix}_{plot_name}.png"), + plot = plots[[plot_name]], + device = "png" + ) + }) diff --git a/bin/chipseeker_plotlist.R b/bin/chipseeker_plotlist.R new file mode 100644 index 00000000..8ffe76fc --- /dev/null +++ b/bin/chipseeker_plotlist.R @@ -0,0 +1,18 @@ +#!/usr/bin/env Rscript +# chipseeker plots that accept list of annotation objects +load_package <- function(x) { + suppressPackageStartupMessages(library(x, character.only = TRUE)) +} +messages <- lapply(c("ChIPseeker", "dplyr", "ggplot2", "glue"), load_package) + +parser <- argparse::ArgumentParser() +parser$add_argument("-a", "--annotations", required = TRUE, type = "character", dest = "annots", help = "space-delimited list of Rds files of peak annotation objects from chipseeker") +parser$add_argument("-o", "--outfile", required = TRUE, type = "character", help = "output filenames") + +args <- parser$parse_args() + +peak_anno_list <- args$annotations %>% + str_split(" ") %>% + lapply(readRds) +anno_bar_plot <- plotAnnoBar(peak_anno_list) +ggsave(filename = args$outfile, plot = anno_bar_plot) diff --git a/modules/local/chipseeker/annotate/main.nf b/modules/local/chipseeker/annotate/main.nf index f70c7e94..7cb2099f 100644 --- a/modules/local/chipseeker/annotate/main.nf +++ b/modules/local/chipseeker/annotate/main.nf @@ -3,21 +3,21 @@ process CHIPSEEKER_ANNOTATE { label 'peaks' label 'process_medium' - container 'nciccbr/ccbr_atacseq:v1.10' + container 'nciccbr/ccbr_chipseeker:v1.0.1' input: tuple val(meta), path(bed) output: - tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: annot + tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt + path("*.annotation.Rds"), emit: annot + path("*.png"), emit: plots script: """ annotate_peaks.R \\ - --narrowpeak ${bed} \\ - --annotated ${meta.id}.${meta.group}.annotated.txt \\ - --atypefreq ${meta.id}.${meta.group}.summary.txt \\ - --genelist ${meta.id}.${meta.group}.genelist.txt \\ + --peak ${bed} \\ + --outfile-prefix ${meta.id}.${meta.group} \\ --genome ${params.genome} \\ --uptss 2000 \\ --downtss 2000 \\ @@ -26,9 +26,9 @@ process CHIPSEEKER_ANNOTATE { stub: """ - for ftype in annotated summary genelist + for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png do - touch ${meta.id}.${meta.group}.\${ftype}.txt + touch ${meta.id}.${meta.group}.\${ftype} done """ } diff --git a/modules/local/chipseeker/plotlist/main.nf b/modules/local/chipseeker/plotlist/main.nf new file mode 100644 index 00000000..f176817d --- /dev/null +++ b/modules/local/chipseeker/plotlist/main.nf @@ -0,0 +1,25 @@ +process CHIPSEEKER_PLOTLIST { + tag "${meta.id}.${meta.group}" + label 'peaks' + label 'process_medium' + + container 'nciccbr/ccbr_chipseeker:v1.0.1' + + input: + path(rds) + + output: + path("*.png"), emit: plots + + script: + """ + chipseeker_plotlist.R \\ + --annotations ${rds.join(' ')} \\ + --outfile plot_anno_bar.png + """ + + stub: + """ + touch plot_anno_bar.png + """ +} diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf index 31e5941c..545172c9 100644 --- a/subworkflows/local/peaks.nf +++ b/subworkflows/local/peaks.nf @@ -20,6 +20,7 @@ include { CONSENSUS_PEAKS } from "../../modules/local/consensus_peaks" include { HOMER_MOTIFS } from "../../modules/local/homer" include { MEME_AME } from "../../modules/local/meme" include { CHIPSEEKER_ANNOTATE } from "../../modules/local/chipseeker/annotate" +include { CHIPSEEKER_PLOTLIST } from "../../modules/local/chipseeker/plotlist" workflow CALL_PEAKS { take: @@ -151,17 +152,22 @@ workflow CALL_PEAKS { if (params.run.chipseeker) { CONSENSUS_PEAKS.out.peaks | CHIPSEEKER_ANNOTATE + CHIPSEEKER_ANNOTATE.out.annot.collect() | CHIPSEEKER_PLOTLIST + ch_plots = ch_plots.mix( + CHIPSEEKER_ANNOTATE.out.plots, + CHIPSEEKER_PLOTLIST.out.plots + ) } if (params.run.homer) { HOMER_MOTIFS( CONSENSUS_PEAKS.out.peaks.combine(genome_fasta), params.homer.de_novo, file(params.homer.jaspar_db, checkIfExists: true) ) - } - if (params.genomes[ params.genome ].meme_motifs) { - MEME_AME( HOMER_MOTIFS.out.ame, - file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true) - ) + if (params.genomes[ params.genome ].meme_motifs) { + MEME_AME( HOMER_MOTIFS.out.ame, + file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true) + ) + } } emit: