diff --git a/bin/chipseeker_annotate.R b/bin/chipseeker_annotate.R index 142e84d0..50ec1451 100755 --- a/bin/chipseeker_annotate.R +++ b/bin/chipseeker_annotate.R @@ -26,6 +26,7 @@ load_package(args$txdb) txdb <- args$txdb %>% rlang::sym() %>% eval() +dir.create(file.path(outfile_prefix), showWarnings = FALSE) # parse peak file np <- read.table(args$peak, sep = "\t") @@ -71,7 +72,7 @@ annot <- annotatePeak( ignoreDownstream = FALSE, overlap = "TSS" ) -saveRDS(annot, file = glue("{outfile_prefix}.annotation.Rds")) +saveRDS(annot, file = glue("{outfile_prefix}/{outfile_prefix}.annotation.Rds")) padf <- as.data.frame(annot) padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "") @@ -103,7 +104,7 @@ merged <- dplyr::full_join(padf, np, by = "peakID") %>% dplyr::rename("#peakID" = "peakID") %>% dplyr::arrange(dplyr::desc(qValue)) -annotated_outfile <- glue("{outfile_prefix}.annotated.txt") +annotated_outfile <- glue("{outfile_prefix}/{outfile_prefix}.annotated.txt") write.table(merged, annotated_outfile, sep = "\t", quote = FALSE, row.names = FALSE) l <- paste("# Median peak width : ", median(merged$width), sep = "") write(l, annotated_outfile, append = TRUE) @@ -114,7 +115,7 @@ write(l, annotated_outfile, append = TRUE) # get promoter genes -outfile_genelist <- glue("{outfile_prefix}.genelist.txt") +outfile_genelist <- glue("{outfile_prefix}/{outfile_prefix}.genelist.txt") # ... all lines with annotation starting with "Promoter" promoters1 <- dplyr::filter(merged, grepl("Promoter", annotation)) # ... all lines with annotation is "5' UTR" @@ -138,7 +139,7 @@ write(l, outfile_genelist, append = TRUE) # annotation type frequency table l <- paste("#annotationType", "frequency", "medianWidth", "medianpValue", "medianqValue", sep = "\t") -outfile_summary <- glue("{outfile_prefix}.summary.txt") +outfile_summary <- glue("{outfile_prefix}/{outfile_prefix}.summary.txt") write(l, outfile_summary) atypes <- c( "3' UTR", @@ -169,7 +170,7 @@ for (ann in c("Exon", "Intron")) { # upset plot ggsave( - filename = glue("{outfile_prefix}_upsetplot.png"), + filename = glue("{outfile_prefix}/{outfile_prefix}_upsetplot.png"), plot = upsetplot(annot, vennpie = TRUE), device = "png" ) diff --git a/conf/full_mm10.config b/conf/full_mm10.config index 5d5c7de9..93e6c055 100644 --- a/conf/full_mm10.config +++ b/conf/full_mm10.config @@ -5,7 +5,8 @@ params { genome = 'mm10' outdir = "results/full_mm10" input = "${projectDir}/assets/samplesheet_full_mm10.csv" - contrasts = "${projectDir}/assets/contrasts_full_mm10.yml" + contrasts = 'true' + contrastsheet = "${projectDir}/assets/contrasts_full_mm10.yml" sicer { species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py } diff --git a/conf/test_human.config b/conf/test_human.config index 2af4f0a8..0ea1f6bb 100644 --- a/conf/test_human.config +++ b/conf/test_human.config @@ -5,7 +5,7 @@ params { genome = 'hg38' outdir = "results/human" input = "assets/samplesheet_human.csv" - contrasts = null //'assets/contrasts_human.yml' + contrasts = false //'assets/contrasts_human.yml' //read_length = 50 sicer.species = "${params.genome}" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py diff --git a/conf/test_mm10.config b/conf/test_mm10.config index 19f66705..421ff225 100644 --- a/conf/test_mm10.config +++ b/conf/test_mm10.config @@ -5,7 +5,8 @@ params { genome = 'mm10' outdir = "results/test_mm10" input = "${projectDir}/assets/samplesheet_test_mm10.csv" - contrasts = "${projectDir}/assets/contrasts_test_mm10.yml" + contrasts = 'true' + contrastsheet = "${projectDir}/assets/contrasts_test_mm10.yml" read_length = 50 sicer.species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py diff --git a/docs/manifests.md b/docs/manifests.md new file mode 100644 index 00000000..485ebe19 --- /dev/null +++ b/docs/manifests.md @@ -0,0 +1,30 @@ +# TODO Jotting notes here +## Samplemanifest +The following columns are required: + +- sample: sampleID; does not need to be a unique column +- rep: replicateID of sampleID; does not need to be a unique column +- fastq_1: absolute path to R1 of sampleID +- fastq_2: absolute path to R1 of sampleID +- antibody: -c sampleID for mac2; this must match a unique {sample}_{rep} format +- control: + +Example antibody / control format for a single-end project: + +``` +sample,rep,fastq_1,fastq_2,antibody,control +sample,1,/path/to/sample_1.R1.fastq.gz,,input_1,input_1 +sample,2,/path/to/sample_2.R1.fastq.gz,,input_1,input_1 +input,1,/path/to/sample1.R1.fastq.gz,,, +input,2,/path/to/sample1.R1.fastq.gz,,, +``` + +Example antibody / control format for a paired-end project: + +``` +sample,rep,fastq_1,fastq_2,antibody,control +sample,1,/path/to/sample_1.R1.fastq.gz,/path/to/sample_1.R2.fastq.gz,input_1,input_1 +sample,2,/path/to/sample_2.R1.fastq.gz,/path/to/sample_1.R2.fastq.gz,input_1,input_1 +input,1,/path/to/input_1.R1.fastq.gz,/path/to/input_1.R2.fastq.gz,, +input,2,/path/to/input_2.R1.fastq.gz,/path/to/input_2.R2.fastq.gz,, +``` \ No newline at end of file diff --git a/docs/workflow.md b/docs/workflow.md new file mode 100644 index 00000000..be2a72d8 --- /dev/null +++ b/docs/workflow.md @@ -0,0 +1,131 @@ +## Process workflow + +TODO add images to show workflow + +### Pipeline Checks +- Input files are checked that they meet standard formatting; some file access is reviewed + +- Processes include: + + - INPUT_CHECK:SAMPLESHEET_CHECK + - INPUT_CHECK:CHECK_CONTRASTS + +- Output directories include: + + - check_contrasts + +## Pre-alignment +- Adaptors are trimmed, if blacklists are included, filtering occurs + +- Processes include: + + - CUTADAPT + - FILTER_BLACKLIST:BWA_MEM + - FILTER_BLACKLIST:SAMTOOLS_FILTERALIGNED + - FILTER_BLACKLIST:PICARD_SAMTOFASTQ + - FILTER_BLACKLIST:CUSTOM_COUNTFASTQ + +- Output directories include: + + - cutadapt + +## Alignment +- Samples are aligned using BWA; alignment stats are generated; samples are sorted and filtered + +- Processes include: + + - ALIGN_GENOME:BWA_MEM + - ALIGN_GENOME:SAMTOOLS_FLAGSTAT_ALIGN + - ALIGN_GENOME:FILTER_QUALITY + - ALIGN_GENOME:SAMTOOLS_SORT + - ALIGN_GENOME:SAMTOOLS_FLAGSTAT_FILTER + +- Output directories include: + + - bwa_mem + - samtools_flagstat_align + - samtools_filteraligned + - samtools_sort + - samtools_flagstat_filter + +## Deduplicate +- Processes include: + + - DEDUPLICATE:MACS2_DEDUP + - DEDUPLICATE:INDEX_SINGLE + - DEDUPLICATE:PICARD_DEDUP + - DEDUPLICATE:INDEX_PAIRED + +- Output directories include: + +## Quality Control +- Processes include: + + - PPQT_PROCESS + - QC:FASTQC_RAW + - QC:FASTQC_TRIMMED + - QC:FASTQ_SCREE + - QC:PRESEQ + - QC:HANDLE_PRESEQ_ERROR + - QC:PARSE_PRESEQ_LOG + - QC:QC_STATS + - QC:QC_TABLE + +## Deeptools analysis +- Processes include: + + - QC:DEEPTOOLS:BAM_COVERAGE + - QC:DEEPTOOLS:BIGWIG_SUM + - QC:DEEPTOOLS:PLOT_CORRELATION + - QC:DEEPTOOLS:PLOT_PCA + - QC:DEEPTOOLS:NORMALIZE_INPUT + - QC:DEEPTOOLS:BED_PROTEIN_CODING + - QC:DEEPTOOLS:COMPUTE_MATRIX + - QC:DEEPTOOLS:PLOT_HEATMAP + - QC:DEEPTOOLS:PLOT_PROFILE + - QC:DEEPTOOLS:PLOT_FINGERPRINT + +## Peak calling +- Processes include: + + - PHANTOM_PEAKS + - CALL_PEAKS:CALC_GENOME_FRAC + - CALL_PEAKS:BAM_TO_BED + - CALL_PEAKS:MACS_BROAD + - CALL_PEAKS:MACS_NARROW + - CALL_PEAKS:SICER + - CALL_PEAKS:CONVERT_SICER + - CALL_PEAKS:GEM + - CALL_PEAKS:FILTER_GEM + - CALL_PEAKS:FRACTION_IN_PEAKS + - CALL_PEAKS:CONCAT_FRIPS + - CALL_PEAKS:PLOT_FRIP + - CALL_PEAKS:GET_PEAK_META + - CALL_PEAKS:CONCAT_PEAK_META + - CALL_PEAKS:PLOT_PEAK_WIDTHS + +## Consensus Peaks +- Processes include: + + - CONSENSUS_PEAKS:CAT_CAT + - CONSENSUS_PEAKS:SORT_BED + - CONSENSUS_PEAKS:BEDTOOLS_MERGE + - CONSENSUS_PEAKS:- CONSENSUS_PEAKS:_OUT + +## Annotate +- Processes include: + + - ANNOTATE:CHIPSEEKER_PEAKPLOT + - ANNOTATE:CHIPSEEKER_ANNOTATE + - ANNOTATE:CHIPSEEKER_PLOTLIST + - ANNOTATE:HOMER_MOTIFS + - ANNOTATE:MEME_AME + +## Differential Analysis +- If there are more than 2 replicates per group then `diffbind` is performed; otherwise `manorm` pairewise analysis is performed + +- Processes include: + + - DIFF:DIFFBIND:PREP_DIFFBIND + - DIFF:DIFFBIND:DIFFBIND_RMD + - DIFF:MANORM:MANORM_PAIRWISE \ No newline at end of file diff --git a/main.nf b/main.nf index 1549c54c..5c7b6b4d 100644 --- a/main.nf +++ b/main.nf @@ -36,6 +36,9 @@ include { PHANTOM_PEAKS PPQT_PROCESS MULTIQC } from "./modules/local/qc.nf" + +contrastsheet = params.contrastsheet ?: "/assets/contrast_test.ymls" + workflow.onComplete { if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) { def message = Utils.spooker(workflow) @@ -64,7 +67,7 @@ workflow { } workflow CHIPSEQ { - INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center) + INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center, file(contrastsheet)) INPUT_CHECK.out.reads.set { raw_fastqs } raw_fastqs | CUTADAPT @@ -129,7 +132,6 @@ workflow CHIPSEQ { } .set{ ch_consensus_peaks } if (params.contrasts) { - contrasts = file(params.contrasts, checkIfExists: true) // TODO use consensus peaks for regions of interest in diffbind CALL_PEAKS.out.bam_peaks .combine(deduped_bam) @@ -145,8 +147,7 @@ workflow CHIPSEQ { .set{ tagalign_peaks } DIFF( bam_peaks, tagalign_peaks, - INPUT_CHECK.out.csv, - contrasts + INPUT_CHECK.out.contrasts ) } diff --git a/mkdocs.yml b/mkdocs.yml index cd9f0885..921a8b8d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -96,6 +96,8 @@ nav: - Home: index.md - Usage: - Getting Started: getting-started.md + - Creating Manifests: manifests.md + - Workflow Overview: workflow.md - Developer Guide: - How to contribute: contributing.md - Release guide: release-guide.md diff --git a/modules/local/chipseeker/annotate/main.nf b/modules/local/chipseeker/annotate/main.nf index 756304e7..82788116 100644 --- a/modules/local/chipseeker/annotate/main.nf +++ b/modules/local/chipseeker/annotate/main.nf @@ -1,5 +1,5 @@ process CHIPSEEKER_ANNOTATE { - tag "${meta.id}.${meta.group}" + tag "${meta.id}" label 'peaks' label 'process_high' @@ -11,15 +11,15 @@ process CHIPSEEKER_ANNOTATE { val(annot_db) output: - tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt - path("*.annotation.Rds"), emit: annot - path("*.png"), emit: plots + tuple val(meta), path("${meta.id}/*.annotated.txt"), path("${meta.id}/*.summary.txt"), path("${meta.id}/*.genelist.txt"), emit: txt + path("${meta.id}/*.annotation.Rds"), emit: annot + path("${meta.id}/*.png"), emit: plots script: """ chipseeker_annotate.R \\ --peak ${bed} \\ - --outfile-prefix ${meta.id}.${meta.group} \\ + --outfile-prefix ${meta.id} \\ --genome-txdb ${txdb} \\ --genome-annot ${annot_db} \\ --uptss 2000 \\ @@ -32,7 +32,7 @@ process CHIPSEEKER_ANNOTATE { """ for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png do - touch ${meta.id}.${meta.group}.\${ftype} + touch ${meta.id}/${meta.id}.\${ftype} done """ } diff --git a/modules/local/chipseeker/peakplot/main.nf b/modules/local/chipseeker/peakplot/main.nf index 9c3eaa4e..eada5474 100644 --- a/modules/local/chipseeker/peakplot/main.nf +++ b/modules/local/chipseeker/peakplot/main.nf @@ -1,7 +1,7 @@ process CHIPSEEKER_PEAKPLOT { tag "${meta.id}" label 'peaks' - label 'process_medium' + label 'process_high' container 'nciccbr/ccbr_chipseeker:1.1.2' diff --git a/modules/local/deeptools.nf b/modules/local/deeptools.nf index 625b9118..5835cc7a 100644 --- a/modules/local/deeptools.nf +++ b/modules/local/deeptools.nf @@ -70,7 +70,7 @@ process NORMALIZE_INPUT { process BIGWIG_SUM { label 'qc' label 'deeptools' - label 'process_high' + label 'process_high_memory' container "${params.containers.deeptools}" @@ -307,7 +307,7 @@ process PLOT_HEATMAP { process PLOT_PROFILE { label 'qc' label 'deeptools' - label 'process_single' + label 'process_low' container "${params.containers.deeptools}" diff --git a/nextflow.config b/nextflow.config index 8b66688b..6c5829d4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,11 +2,14 @@ nextflow.enable.dsl = 2 params { input = null - contrasts = null + contrasts = false seq_center = null read_length = null genome = null + // manifest for contrasts + contrastsheet = null + // custom genome options genome_fasta = null genes_gtf = null diff --git a/subworkflows/local/differential/main.nf b/subworkflows/local/differential/main.nf index 4115e08a..7e386e9f 100644 --- a/subworkflows/local/differential/main.nf +++ b/subworkflows/local/differential/main.nf @@ -1,4 +1,3 @@ -include { CHECK_CONTRASTS } from "../../../modules/local/check_contrasts/" include { DIFFBIND } from "../../../subworkflows/local/diffbind/" include { MANORM } from "../../../subworkflows/local/manorm/" @@ -6,22 +5,9 @@ workflow DIFF { take: bam_peaks tagalign_peaks - samplesheet_file - contrasts_file + contrasts main: - - CHECK_CONTRASTS(samplesheet_file, contrasts_file) - .csv - .flatten() - .splitCsv( header: true, sep: ',' ) - .map{ it -> - meta = get_contrast_meta(it) - [ sample_basename: meta.sample_basename, group: meta.group, contrast: meta.contrast ] - } - .unique() - .set{ contrasts } - bam_peaks .map{ meta, bam, bai, peak, ctrl_bam, ctrl_bai -> [meta.sample_basename, meta.rep] } .groupTuple() @@ -71,18 +57,4 @@ workflow DIFF { diff_peaks = bam_peaks // TODO -} - -def get_contrast_meta(LinkedHashMap row) { - def meta = [:] - meta.id = row.sample - meta.sample_basename = row.sample_basename - meta.rep = row.rep - meta.single_end = row.single_end.toBoolean() - meta.antibody = row.antibody - meta.control = row.control - meta.group = row.group - meta.contrast = row.contrast - - return meta -} +} \ No newline at end of file diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 36602633..2d5fdb80 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -4,11 +4,13 @@ // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check.nf' +include { CHECK_CONTRASTS } from "../../modules/local/check_contrasts/" workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv seq_center // string: sequencing center for read group + contrastsheet // file: /path/to/contrast.yaml main: valid_csv = SAMPLESHEET_CHECK( samplesheet ).csv @@ -17,10 +19,26 @@ workflow INPUT_CHECK { .map { create_fastq_channel(it, seq_center) } .set { reads } + // Run check on the contrast manifest + contrasts=Channel.empty() + if (params.contrasts) { + CHECK_CONTRASTS(valid_csv, contrastsheet) + .csv + .flatten() + .splitCsv( header: true, sep: ',' ) + .map{ it -> + meta = get_contrast_meta(it) + [ sample_basename: meta.sample_basename, group: meta.group, contrast: meta.contrast ] + } + .unique() + .set{ contrasts } + } + emit: - reads // channel: [ val(meta), [ reads ] ] - csv = valid_csv - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + reads = reads // channel: [ val(meta), [ reads ] ] + csv = valid_csv + contrasts = contrasts + versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] @@ -33,12 +51,6 @@ def create_fastq_channel(LinkedHashMap row, String seq_center) { meta.antibody = row.antibody meta.control = row.control - def read_group = "\'@RG\\tID:${meta.id}\\tSM:${meta.id.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${meta.id}\\tPU:1\'" - if (seq_center) { - read_group = "\'@RG\\tID:${meta.id}\\tSM:${meta.id.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${meta.id}\\tPU:1\\tCN:${seq_center}\'" - } - meta.read_group = read_group - // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] if (!file(row.fastq_1).exists()) { @@ -54,3 +66,18 @@ def create_fastq_channel(LinkedHashMap row, String seq_center) { } return fastq_meta } + +// Function to get contrast list of [meta, [id, sample_basename, rep, single_end, antibody, control, group, contrast]] +def get_contrast_meta(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.sample_basename = row.sample_basename + meta.rep = row.rep + meta.single_end = row.single_end.toBoolean() + meta.antibody = row.antibody + meta.control = row.control + meta.group = row.group + meta.contrast = row.contrast + + return meta +}