Skip to content

Commit

Permalink
feat: add chipseeker plots to annotation step
Browse files Browse the repository at this point in the history
separate process for plotAnnoBar from list of annotations
  • Loading branch information
kelly-sovacool committed Nov 16, 2023
1 parent 1d43f03 commit b00d9df
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 57 deletions.
110 changes: 66 additions & 44 deletions bin/annotate_peaks.R
Original file line number Diff line number Diff line change
@@ -1,50 +1,47 @@
#!/usr/bin/env Rscript
# adapted from: https://github.com/CCBR/ASPEN/blob/55f909d76500c3502c1c397ef3000908649b0284/workflow/scripts/ccbr_annotate_peaks.R
load_package <- function(x) {
suppressPackageStartupMessages(library(x, character.only = TRUE))
suppressPackageStartupMessages(library(x, character.only = TRUE))
return()
}
lapply(c('dplyr', 'ChIPseeker'), load_package)
messages <- lapply(c("ChIPseeker", "dplyr", "glue", "ggplot2"), load_package)

parser <- argparse::ArgumentParser()
parser$add_argument("-n", "--narrowpeak",
required = TRUE,
dest = "narrowpeak", help = "narrowpeak file"
)
parser$add_argument("-a", "--annotated",
required = TRUE, dest = "annotated",
help = "annotated output file"
)
parser$add_argument("-p", "--peak", required = TRUE, help = "peak file")
parser$add_argument("-u", "--uptss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS")
parser$add_argument("-d", "--downtss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS")
parser$add_argument("-t", "--toppromoterpeaks", required = FALSE, type = "integer", default = 1000, help = "filter top N peaks in promoters for genelist output")
parser$add_argument("-l", "--genelist", required = TRUE, help = "list of genes with peaks in promoter regions")
parser$add_argument("-f", "--atypefreq", required = TRUE, help = "frequency of different annotation types")
parser$add_argument("-g", "--genome",
required = TRUE, type = "character", dest = "genome",
help = "hg38/hg19/mm10/mm9/mmul10/bosTau9"
)
parser$add_argument("-o", "--outfile-prefix", required = TRUE, type = "character", dest = "outfile_prefix", help = "prefix for output filenames")
parser$add_argument("-g", "--genome", required = TRUE, help = "hg38/hg19/mm10/mm9/mmul10/bosTau9")

# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
args <- parser$parse_args()
outfile_prefix <- args$outfile_prefix

genomes <- tibble::tribble(
~ref_genome, ~adb, ~tdb,
"mm9", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene",
"mm10", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene",
"hg19", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene",
"hg38", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene",
"mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene",
"bosTau9", "org.Bt.eg.db", "TxDb.Btaurus.UCSC.bosTau9.refGene"
~ref_genome, ~adb, ~tdb,
"mm9", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene",
"mm10", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene",
"hg19", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene",
"hg38", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene",
"mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene",
"bosTau9", "org.Bt.eg.db", "TxDb.Btaurus.UCSC.bosTau9.refGene"
)
adb <- genomes %>% filter(ref_genome == args$genome) %>% pull(adb)
adb <- genomes %>%
filter(ref_genome == args$genome) %>%
pull(adb)
load_package(adb)
tdb_str <- genomes %>% filter(ref_genome == args$genome) %>% pull(tdb)
tdb_str <- genomes %>%
filter(ref_genome == args$genome) %>%
pull(tdb)
load_package(tdb_str)
tdb <- tdb_str %>% rlang::sym() %>% eval()
tdb <- tdb_str %>%
rlang::sym() %>%
eval()

np <- read.table(args$narrowpeak, sep = "\t")
peak_colnames <- c(
np <- read.table(args$peak, sep = "\t")
peak_colnames <- c(
"chrom",
"chromStart",
"chromEnd",
Expand All @@ -55,14 +52,14 @@ peak_colnames <- c(
"pValue",
"qValue"
)
num_columns <- ncol(np)
num_columns <- ncol(np)
if (num_columns == 9) {
colnames(np) <- peak_colnames
np <- np %>% mutate(peak = NA)
colnames(np) <- peak_colnames
np <- np %>% mutate(peak = NA)
} else if (num_columns == 10) {
colnames(np) <- c(peak_colnames, "peak")
colnames(np) <- c(peak_colnames, "peak")
} else {
stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given"))
stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given"))
}
np <- np[order(-np$qValue), ]
np$peakID <- paste(np$chrom, ":", np$chromStart, "-", np$chromEnd, sep = "")
Expand All @@ -83,6 +80,7 @@ pa <- annotatePeak(
ignoreDownstream = FALSE,
overlap = "TSS"
)
saveRDS(pa, file = glue("{outfile_prefix}.annotation.Rds"))

padf <- as.data.frame(pa)
padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "")
Expand Down Expand Up @@ -143,13 +141,14 @@ colnames(merged) <- c(

# merge annotation with narrowPeak file
merged <- merged[order(-merged$qValue), ]
write.table(merged, args$annotated, sep = "\t", quote = FALSE, row.names = FALSE)
annotated_outfile <- glue("{outfile_prefix}.annotated.txt")
write.table(merged, annotated_outfile, sep = "\t", quote = FALSE, row.names = FALSE)
l <- paste("# Median peak width : ", median(merged$width), sep = "")
write(l, args$annotated, append = TRUE)
write(l, annotated_outfile, append = TRUE)
l <- paste("# Median pValue : ", median(merged$pValue), sep = "")
write(l, args$annotated, append = TRUE)
write(l, annotated_outfile, append = TRUE)
l <- paste("# Median qValue : ", median(merged$qValue), sep = "")
write(l, args$annotated, append = TRUE)
write(l, annotated_outfile, append = TRUE)


# get promoter genes
Expand All @@ -162,18 +161,20 @@ promoters <- promoters[order(-promoters$qValue), ]
promoters <- head(promoters, n = args$toppromoterpeaks)
promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")])
colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL")
write.table(promoter_genes, args$genelist, sep = "\t", quote = FALSE, row.names = FALSE)
outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
write.table(promoter_genes, outfile_genelist, sep = "\t", quote = FALSE, row.names = FALSE)
l <- paste("# Median peak width : ", median(promoters$width), sep = "")
write(l, args$genelist, append = TRUE)
write(l, outfile_genelist, append = TRUE)
l <- paste("# Median pValue : ", median(promoters$pValue), sep = "")
write(l, args$genelist, append = TRUE)
write(l, outfile_genelist, append = TRUE)
l <- paste("# Median qValue : ", median(promoters$qValue), sep = "")
write(l, args$genelist, append = TRUE)
write(l, outfile_genelist, append = TRUE)

# annotation type frequency table

l <- paste("#annotationType", "frequency", "medianWidth", "medianpValue", "medianqValue", sep = "\t")
write(l, args$atypefreq)
outfile_summary <- glue("{outfile_prefix}.summary.txt")
write(l, outfile_summary)
atypes <- c(
"3' UTR",
"5' UTR",
Expand All @@ -190,13 +191,34 @@ for (ann in atypes) {
p <- median(x$pValue)
q <- median(x$qValue)
l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t")
write(l, args$atypefreq, append = TRUE)
write(l, outfile_summary, append = TRUE)
}
for (ann in c("Exon", "Intron")) {
x <- dplyr::filter(merged, grepl(ann, annotation))
w <- median(x$width)
p <- median(x$pValue)
q <- median(x$qValue)
l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t")
write(l, args$atypefreq, append = TRUE)
write(l, outfile_summary, append = TRUE)
}

# plots for individual peak file
peak <- readPeakFile(args$peak)
plots <- list(
covplot = covplot(peak, weightCol = "V5"),
plotPeakProf2 = plotPeakProf2(
peak = peak, upstream = rel(0.2), downstream = rel(0.2),
conf = 0.95, by = "gene", type = "body", nbin = 800,
TxDb = txdb, weightCol = "V5", ignore_strand = F
),
upsetplot = upsetplot(annot, vennpie = TRUE)
)

names(plots) %>%
lapply(function(plot_name) {
ggsave(
filename = glue("{outfile_prefix}_{plot_name}.png"),
plot = plots[[plot_name]],
device = "png"
)
})
18 changes: 18 additions & 0 deletions bin/chipseeker_plotlist.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env Rscript
# chipseeker plots that accept list of annotation objects
load_package <- function(x) {
suppressPackageStartupMessages(library(x, character.only = TRUE))
}
messages <- lapply(c("ChIPseeker", "dplyr", "ggplot2", "glue"), load_package)

parser <- argparse::ArgumentParser()
parser$add_argument("-a", "--annotations", required = TRUE, type = "character", dest = "annots", help = "space-delimited list of Rds files of peak annotation objects from chipseeker")
parser$add_argument("-o", "--outfile", required = TRUE, type = "character", help = "output filenames")

args <- parser$parse_args()

peak_anno_list <- args$annotations %>%
str_split(" ") %>%
lapply(readRds)
anno_bar_plot <- plotAnnoBar(peak_anno_list)
ggsave(filename = args$outfile, plot = anno_bar_plot)
16 changes: 8 additions & 8 deletions modules/local/chipseeker/annotate/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@ process CHIPSEEKER_ANNOTATE {
label 'peaks'
label 'process_medium'

container 'nciccbr/ccbr_atacseq:v1.10'
container 'nciccbr/ccbr_chipseeker:v1.0.1'

input:
tuple val(meta), path(bed)

output:
tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: annot
tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt
path("*.annotation.Rds"), emit: annot
path("*.png"), emit: plots

script:
"""
annotate_peaks.R \\
--narrowpeak ${bed} \\
--annotated ${meta.id}.${meta.group}.annotated.txt \\
--atypefreq ${meta.id}.${meta.group}.summary.txt \\
--genelist ${meta.id}.${meta.group}.genelist.txt \\
--peak ${bed} \\
--outfile-prefix ${meta.id}.${meta.group} \\
--genome ${params.genome} \\
--uptss 2000 \\
--downtss 2000 \\
Expand All @@ -26,9 +26,9 @@ process CHIPSEEKER_ANNOTATE {

stub:
"""
for ftype in annotated summary genelist
for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
do
touch ${meta.id}.${meta.group}.\${ftype}.txt
touch ${meta.id}.${meta.group}.\${ftype}
done
"""
}
25 changes: 25 additions & 0 deletions modules/local/chipseeker/plotlist/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
process CHIPSEEKER_PLOTLIST {
tag "${meta.id}.${meta.group}"
label 'peaks'
label 'process_medium'

container 'nciccbr/ccbr_chipseeker:v1.0.1'

input:
path(rds)

output:
path("*.png"), emit: plots

script:
"""
chipseeker_plotlist.R \\
--annotations ${rds.join(' ')} \\
--outfile plot_anno_bar.png
"""

stub:
"""
touch plot_anno_bar.png
"""
}
16 changes: 11 additions & 5 deletions subworkflows/local/peaks.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ include { CONSENSUS_PEAKS } from "../../modules/local/consensus_peaks"
include { HOMER_MOTIFS } from "../../modules/local/homer"
include { MEME_AME } from "../../modules/local/meme"
include { CHIPSEEKER_ANNOTATE } from "../../modules/local/chipseeker/annotate"
include { CHIPSEEKER_PLOTLIST } from "../../modules/local/chipseeker/plotlist"

workflow CALL_PEAKS {
take:
Expand Down Expand Up @@ -151,17 +152,22 @@ workflow CALL_PEAKS {

if (params.run.chipseeker) {
CONSENSUS_PEAKS.out.peaks | CHIPSEEKER_ANNOTATE
CHIPSEEKER_ANNOTATE.out.annot.collect() | CHIPSEEKER_PLOTLIST
ch_plots = ch_plots.mix(
CHIPSEEKER_ANNOTATE.out.plots,
CHIPSEEKER_PLOTLIST.out.plots
)
}
if (params.run.homer) {
HOMER_MOTIFS( CONSENSUS_PEAKS.out.peaks.combine(genome_fasta),
params.homer.de_novo,
file(params.homer.jaspar_db, checkIfExists: true)
)
}
if (params.genomes[ params.genome ].meme_motifs) {
MEME_AME( HOMER_MOTIFS.out.ame,
file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true)
)
if (params.genomes[ params.genome ].meme_motifs) {
MEME_AME( HOMER_MOTIFS.out.ame,
file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true)
)
}
}

emit:
Expand Down

0 comments on commit b00d9df

Please sign in to comment.