From 28fe888353bfe090f4162f75cff60ee1b01968bb Mon Sep 17 00:00:00 2001 From: Raquel Manzano Date: Mon, 21 Aug 2023 09:43:47 +0100 Subject: [PATCH] Updating alignment subworkflow and modules --- lib/WorkflowRnadnavar.groovy | 79 +++---- main.nf | 17 +- modules.json | 105 +++++++-- modules/nf-core/ensemblvep/Dockerfile | 31 --- modules/nf-core/ensemblvep/build.sh | 28 --- modules/nf-core/ensemblvep/environment.yml | 10 - modules/nf-core/ensemblvep/main.nf | 56 ----- modules/nf-core/ensemblvep/meta.yml | 73 ------ modules/nf-core/hisat2/align/meta.yml | 2 +- modules/nf-core/hisat2/build/meta.yml | 2 +- .../hisat2/extractsplicesites/meta.yml | 2 +- modules/nf-core/samtools/flagstat/meta.yml | 2 +- modules/nf-core/samtools/idxstats/meta.yml | 2 +- nextflow_schema.json | 42 +++- subworkflows/local/bam_align/main.nf | 207 ++++++++++++++++++ .../bam_convert_samtools/main.nf} | 16 +- .../local/bam_merge_index_samtools/main.nf | 6 +- .../local/channel_align_create_csv/main.nf | 23 ++ .../fastq_align_bwamem_mem2_dragmap/main.nf | 46 ++++ subworkflows/local/mapping.nf | 199 ----------------- subworkflows/local/mapping_csv.nf | 21 -- subworkflows/local/prepare_genome.nf | 189 ---------------- subworkflows/local/prepare_genome/main.nf | 189 ++++++++++++++++ .../main.nf} | 12 +- .../local/prepare_reference_and_intervals.nf | 22 +- subworkflows/nf-core/align_star.nf | 55 ----- subworkflows/nf-core/bam_sort_samtools.nf | 6 +- subworkflows/nf-core/merge_index_bam.nf | 45 ---- 28 files changed, 671 insertions(+), 816 deletions(-) delete mode 100644 modules/nf-core/ensemblvep/Dockerfile delete mode 100644 modules/nf-core/ensemblvep/build.sh delete mode 100644 modules/nf-core/ensemblvep/environment.yml delete mode 100644 modules/nf-core/ensemblvep/main.nf delete mode 100644 modules/nf-core/ensemblvep/meta.yml create mode 100644 subworkflows/local/bam_align/main.nf rename subworkflows/{nf-core/alignment_to_fastq.nf => local/bam_convert_samtools/main.nf} (87%) create mode 100644 
subworkflows/local/channel_align_create_csv/main.nf create mode 100644 subworkflows/local/fastq_align_bwamem_mem2_dragmap/main.nf delete mode 100644 subworkflows/local/mapping.nf delete mode 100644 subworkflows/local/mapping_csv.nf delete mode 100644 subworkflows/local/prepare_genome.nf create mode 100644 subworkflows/local/prepare_genome/main.nf rename subworkflows/local/{prepare_intervals.nf => prepare_intervals/main.nf} (90%) delete mode 100644 subworkflows/nf-core/align_star.nf delete mode 100644 subworkflows/nf-core/merge_index_bam.nf diff --git a/lib/WorkflowRnadnavar.groovy b/lib/WorkflowRnadnavar.groovy index cbd3c30..a80e6ca 100755 --- a/lib/WorkflowRnadnavar.groovy +++ b/lib/WorkflowRnadnavar.groovy @@ -11,11 +11,9 @@ class WorkflowRnadnavar { // Check and validate parameters // public static void initialise(params, log) { - genomeExistsError(params, log) - - if (!params.fasta) { + if (!params.fasta && params.step == 'annotate') { Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." } } @@ -47,57 +45,15 @@ class WorkflowRnadnavar { return yaml_file_text } - // - // Generate methods description for MultiQC - // - - public static String toolCitationText(params) { - - // TODO Optionally add in-text citation tools to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report - def citation_text = [ - "Tools used in the workflow included:", - "FastQC (Andrews 2010),", - "MultiQC (Ewels et al. 2016)", - "." - ].join(' ').trim() - - return citation_text - } - - public static String toolBibliographyText(params) { - - // TODO Optionally add bibliographic entries to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report - def reference_text = [ - "
  • Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
  • ", - "
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
  • " - ].join(' ').trim() - - return reference_text - } - - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta["manifest_map"] = run_workflow.manifest.toMap() - // Pipeline DOI meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " - // Tool references - meta["tool_citations"] = "" - meta["tool_bibliography"] = "" - - // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! - //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") - //meta["tool_bibliography"] = toolBibliographyText(params) - - def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() @@ -119,4 +75,33 @@ class WorkflowRnadnavar { Nextflow.error(error_string) } } -} + // TODO add consensus and filtering steps here + public static String retrieveInput(params, log){ + def input = '' + if (params.input) input = params.input + else { + switch (params.step) { + case 'mapping': Nextflow.error("Can't start with step $params.step without samplesheet") + break + case 'markduplicates': log.warn("Using file ${params.outdir}/csv/mapped.csv"); + input = params.outdir + "/csv/mapped.csv" + break + case 'prepare_recalibration': log.warn("Using file ${params.outdir}/csv/markduplicates_no_table.csv"); + input = params.outdir + "/csv/markduplicates_no_table.csv" + break + case 'recalibrate': log.warn("Using file ${params.outdir}/csv/markduplicates.csv"); + input = params.outdir + "/csv/markduplicates.csv" + break + case 'variant_calling': log.warn("Using file ${params.outdir}/csv/recalibrated.csv"); + input = params.outdir + "/csv/recalibrated.csv" + break + case 'annotate': log.warn("Using file ${params.outdir}/csv/variantcalled.csv"); + input = params.outdir + "/csv/variantcalled.csv" + break + default: log.warn("Please provide an input samplesheet to the pipeline e.g. 
'--input samplesheet.csv'") + Nextflow.error("Unknown step $params.step") + } + } + return input + } +} \ No newline at end of file diff --git a/main.nf b/main.nf index 621b548..fa73e1d 100644 --- a/main.nf +++ b/main.nf @@ -1,4 +1,5 @@ #!/usr/bin/env nextflow + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/rnadnavar @@ -17,7 +18,9 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.bwa = WorkflowMain.getGenomeAttribute(params, 'bwa') +params.bwamem2 = WorkflowMain.getGenomeAttribute(params, 'bwamem2') +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') params.fasta_fai = WorkflowMain.getGenomeAttribute(params, 'fasta_fai') params.dict = WorkflowMain.getGenomeAttribute(params, 'dict') params.gtf = WorkflowMain.getGenomeAttribute(params, 'gtf') @@ -33,6 +36,15 @@ params.vep_genome = WorkflowMain.getGenomeAttribute(params, 'vep_genom params.vep_species = WorkflowMain.getGenomeAttribute(params, 'vep_species') +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ALTERNATIVE INPUT FILE ON RESTART +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params.input_restart = WorkflowRnadnavar.retrieveInput(params, log) + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ VALIDATE & PRINT PARAMETER SUMMARY @@ -45,7 +57,7 @@ include { validateParameters; paramsHelp } from 'plugin/nf-validation' if (params.help) { def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' - def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv 
--genome GATK.GRCh38 -profile docker --outdir results" log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) System.exit(0) } @@ -81,7 +93,6 @@ workflow NFCORE_RNADNAVAR { // // WORKFLOW: Execute a single named workflow for the pipeline // See: https://github.com/nf-core/rnaseq/issues/619 -// workflow { NFCORE_RNADNAVAR () } diff --git a/modules.json b/modules.json index f698e63..bf5dfad 100644 --- a/modules.json +++ b/modules.json @@ -23,7 +23,7 @@ "bwa/mem": { "branch": "master", "git_sha": "3dc300ddcaa563c1e3503477557c0e0def6df2ce", - "installed_by": ["modules"] + "installed_by": ["modules", "fastq_align_bwa"] }, "bwamem2/index": { "branch": "master", @@ -50,15 +50,30 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "dragmap/align": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] + }, + "dragmap/hashtable": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "ensemblvep": { "branch": "master", "git_sha": "29984d70aea47d06f0062a1785d76c357dd40ea9", "installed_by": ["modules"] }, + "ensemblvep/download": { + "branch": "master", + "git_sha": "9f9e1fc31cb35876922070c0e601ae05abae5cae", + "installed_by": ["modules"] + }, "ensemblvep/vep": { "branch": "master", "git_sha": "9f9e1fc31cb35876922070c0e601ae05abae5cae", - "installed_by": ["vcf_annotate_ensemblvep"] + "installed_by": ["vcf_annotate_ensemblvep", "modules"] }, "fastp": { "branch": "master", @@ -108,7 +123,7 @@ "gatk4/calculatecontamination": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": ["bam_tumor_normal_somatic_variant_calling_gatk", "modules"] }, "gatk4/createsequencedictionary": { "branch": "master", @@ -123,7 +138,7 @@ "gatk4/filtermutectcalls": { "branch": "master", "git_sha": 
"2df2a11d5b12f2a73bca74f103691bc35d83c5fd", - "installed_by": ["modules"] + "installed_by": ["bam_tumor_normal_somatic_variant_calling_gatk", "modules"] }, "gatk4/filtervarianttranches": { "branch": "master", @@ -153,7 +168,7 @@ "gatk4/getpileupsummaries": { "branch": "master", "git_sha": "2df2a11d5b12f2a73bca74f103691bc35d83c5fd", - "installed_by": ["modules"] + "installed_by": ["bam_tumor_normal_somatic_variant_calling_gatk", "modules"] }, "gatk4/indexfeaturefile": { "branch": "master", @@ -173,7 +188,7 @@ "gatk4/learnreadorientationmodel": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": ["bam_tumor_normal_somatic_variant_calling_gatk", "modules"] }, "gatk4/markduplicates": { "branch": "master", @@ -198,7 +213,7 @@ "gatk4/mutect2": { "branch": "master", "git_sha": "2df2a11d5b12f2a73bca74f103691bc35d83c5fd", - "installed_by": ["modules"] + "installed_by": ["bam_tumor_normal_somatic_variant_calling_gatk", "modules"] }, "gatk4/splitncigarreads": { "branch": "master", @@ -228,7 +243,7 @@ "hisat2/align": { "branch": "master", "git_sha": "a1881f6374506f9e031b7af814768cdb44a6a7d3", - "installed_by": ["modules"] + "installed_by": ["fastq_align_hisat2", "modules"] }, "hisat2/build": { "branch": "master", @@ -255,11 +270,31 @@ "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80", "installed_by": ["modules"] }, + "picard/collecthsmetrics": { + "branch": "master", + "git_sha": "0ce3ab0ac301f160225b22254fa238478b4389f2", + "installed_by": ["bam_qc_picard"] + }, + "picard/collectmultiplemetrics": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_qc_picard"] + }, + "picard/collectwgsmetrics": { + "branch": "master", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["bam_qc_picard"] + }, "picard/filtersamreads": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + 
"picard/markduplicates": { + "branch": "master", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["bam_markduplicates_picard"] + }, "samblaster": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", @@ -288,17 +323,17 @@ "samtools/flagstat": { "branch": "master", "git_sha": "570ec5bcfe19c49e16c9ca35a7a116563af6cc1c", - "installed_by": ["modules"] + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/idxstats": { "branch": "master", "git_sha": "e662ab16e0c11f1e62983e21de9871f59371a639", - "installed_by": ["modules"] + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": ["bam_markduplicates_picard", "modules", "bam_sort_stats_samtools"] }, "samtools/merge": { "branch": "master", @@ -313,12 +348,12 @@ "samtools/sort": { "branch": "master", "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", - "installed_by": ["modules"] + "installed_by": ["bam_sort_stats_samtools", "modules"] }, "samtools/stats": { "branch": "master", "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", - "installed_by": ["modules"] + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/view": { "branch": "master", @@ -328,7 +363,7 @@ "star/align": { "branch": "master", "git_sha": "57d75dbac06812c59798a48585032f6e50bb1914", - "installed_by": ["modules"] + "installed_by": ["fastq_align_star", "modules"] }, "star/genomegenerate": { "branch": "master", @@ -348,7 +383,7 @@ "tabix/tabix": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules", "vcf_annotate_ensemblvep"] + "installed_by": ["vcf_annotate_ensemblvep", "modules"] }, "untar": { "branch": "master", @@ -369,6 +404,46 @@ }, "subworkflows": { "nf-core": { + "bam_markduplicates_picard": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + 
"installed_by": ["subworkflows"] + }, + "bam_qc_picard": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["subworkflows"] + }, + "bam_sort_stats_samtools": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["fastq_align_bwa", "fastq_align_star", "fastq_align_hisat2"] + }, + "bam_stats_samtools": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["bam_markduplicates_picard", "bam_sort_stats_samtools"] + }, + "bam_tumor_normal_somatic_variant_calling_gatk": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["subworkflows"] + }, + "fastq_align_bwa": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["subworkflows"] + }, + "fastq_align_hisat2": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["subworkflows"] + }, + "fastq_align_star": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["subworkflows"] + }, "vcf_annotate_ensemblvep": { "branch": "master", "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", diff --git a/modules/nf-core/ensemblvep/Dockerfile b/modules/nf-core/ensemblvep/Dockerfile deleted file mode 100644 index 7d2c99c..0000000 --- a/modules/nf-core/ensemblvep/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM nfcore/base:1.14 -LABEL \ - author="Maxime Garcia" \ - description="VEP image for nf-core pipelines" \ - maintainer="maxime.garcia@scilifelab.se" - -# Install the conda environment -COPY environment.yml / -RUN conda env create -f /environment.yml && conda clean -a - -# Setup default ARG variables -ARG GENOME=GRCh38 -ARG SPECIES=homo_sapiens -ARG VEP_CACHE_VERSION=106 -ARG VEP_VERSION=106.1 - -# Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH 
/opt/conda/envs/nf-core-vep-${VEP_VERSION}/bin:$PATH - -# Download Genome -RUN vep_install \ - -a c \ - -c .vep \ - -s ${SPECIES} \ - -y ${GENOME} \ - --CACHE_VERSION ${VEP_CACHE_VERSION} \ - --CONVERT \ - --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE - -# Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-vep-${VEP_VERSION} > nf-core-vep-${VEP_VERSION}.yml diff --git a/modules/nf-core/ensemblvep/build.sh b/modules/nf-core/ensemblvep/build.sh deleted file mode 100644 index eaa3ed5..0000000 --- a/modules/nf-core/ensemblvep/build.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Build and push all containers - -build_push() { - GENOME=$1 - SPECIES=$2 - VEP_CACHE_VERSION=$3 - VEP_VERSION=$4 - - docker build \ - . \ - -t nfcore/vep:${VEP_VERSION}.${GENOME} \ - --build-arg GENOME=${GENOME} \ - --build-arg SPECIES=${SPECIES} \ - --build-arg VEP_CACHE_VERSION=${VEP_CACHE_VERSION} \ - --build-arg VEP_VERSION=${VEP_VERSION} - - docker push nfcore/vep:${VEP_VERSION}.${GENOME} -} - -build_push "GRCh37" "homo_sapiens" "106" "106.1" -build_push "GRCh38" "homo_sapiens" "106" "106.1" -build_push "GRCm38" "mus_musculus" "102" "106.1" -build_push "GRCm39" "mus_musculus" "106" "106.1" -build_push "CanFam3.1" "canis_lupus_familiaris" "104" "106.1" -build_push "WBcel235" "caenorhabditis_elegans" "106" "106.1" diff --git a/modules/nf-core/ensemblvep/environment.yml b/modules/nf-core/ensemblvep/environment.yml deleted file mode 100644 index d378f81..0000000 --- a/modules/nf-core/ensemblvep/environment.yml +++ /dev/null @@ -1,10 +0,0 @@ -# You can use this file to create a conda environment for this module: -# conda env create -f environment.yml -name: nf-core-vep-106.1 -channels: - - conda-forge - - bioconda - - defaults - -dependencies: - - bioconda::ensembl-vep=106.1 diff --git a/modules/nf-core/ensemblvep/main.nf b/modules/nf-core/ensemblvep/main.nf deleted file mode 100644 index 6e78546..0000000 --- 
a/modules/nf-core/ensemblvep/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process ENSEMBLVEP { - tag "$meta.id" - label 'process_medium' - - conda (params.enable_conda ? "bioconda::ensembl-vep=107.0" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ensembl-vep:107.0--pl5321h4a94de4_0' : - 'quay.io/biocontainers/ensembl-vep:107.0--pl5321h4a94de4_0' }" - - input: - tuple val(meta), path(vcf) - val genome - val species - val cache_version - path cache - path fasta - path extra_files - - output: - tuple val(meta), path("*.ann.vcf") , optional:true, emit: vcf - tuple val(meta), path("*.ann.tab") , optional:true, emit: tab - tuple val(meta), path("*.ann.json") , optional:true, emit: json - path "*.summary.html" , emit: report - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def file_extension = args.contains("--vcf") ? 'vcf' : args.contains("--json")? 'json' : args.contains("--tab")? 'tab' : 'vcf' - def prefix = task.ext.prefix ?: "${meta.id}" - def dir_cache = cache ? "\${PWD}/${cache}" : "/.vep" - def reference = fasta ? "--fasta $fasta" : "" - - """ - vep \\ - -i $vcf \\ - -o ${prefix}.ann.${file_extension} \\ - $args \\ - $reference \\ - --assembly $genome \\ - --species $species \\ - --cache \\ - --cache_version $cache_version \\ - --dir_cache $dir_cache \\ - --fork $task.cpus \\ - --stats_file ${prefix}.summary.html \\ - - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/ensemblvep/meta.yml b/modules/nf-core/ensemblvep/meta.yml deleted file mode 100644 index a4dde8a..0000000 --- a/modules/nf-core/ensemblvep/meta.yml +++ /dev/null @@ -1,73 +0,0 @@ -name: ENSEMBLVEP -description: Ensembl Variant Effect Predictor (VEP). 
The output-file-format is controlled through `task.ext.args`. -keywords: - - annotation -tools: - - ensemblvep: - description: | - VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs - or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. - homepage: https://www.ensembl.org/info/docs/tools/vep/index.html - documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html - licence: ["Apache-2.0"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - vcf: - type: file - description: | - vcf to annotate - - genome: - type: value - description: | - which genome to annotate with - - species: - type: value - description: | - which species to annotate with - - cache_version: - type: value - description: | - which version of the cache to annotate with - - cache: - type: file - description: | - path to VEP cache (optional) - - fasta: - type: file - description: | - reference FASTA file (optional) - pattern: "*.{fasta,fa}" - - extra_files: - type: tuple - description: | - path to file(s) needed for plugins (optional) -output: - - vcf: - type: file - description: | - annotated vcf (optional) - pattern: "*.ann.vcf" - - tab: - type: file - description: | - tab file with annotated variants (optional) - pattern: "*.ann.tab" - - json: - type: file - description: | - json file with annotated variants (optional) - pattern: "*.ann.json" - - report: - type: file - description: VEP report file - pattern: "*.html" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@maxulysse" diff --git a/modules/nf-core/hisat2/align/meta.yml b/modules/nf-core/hisat2/align/meta.yml index 008a961..001e5d8 100644 --- a/modules/nf-core/hisat2/align/meta.yml +++ b/modules/nf-core/hisat2/align/meta.yml @@ -8,7 +8,7 @@ keywords: tools: - hisat2: - description: HISAT2 is a fast and 
sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. + description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. homepage: https://daehwankimlab.github.io/hisat2/ documentation: https://daehwankimlab.github.io/hisat2/manual/ doi: "10.1038/s41587-019-0201-4" diff --git a/modules/nf-core/hisat2/build/meta.yml b/modules/nf-core/hisat2/build/meta.yml index e61bf2a..854732f 100644 --- a/modules/nf-core/hisat2/build/meta.yml +++ b/modules/nf-core/hisat2/build/meta.yml @@ -8,7 +8,7 @@ keywords: - reference tools: - hisat2: - description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. + description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. homepage: https://daehwankimlab.github.io/hisat2/ documentation: https://daehwankimlab.github.io/hisat2/manual/ doi: "10.1038/s41587-019-0201-4" diff --git a/modules/nf-core/hisat2/extractsplicesites/meta.yml b/modules/nf-core/hisat2/extractsplicesites/meta.yml index f70de08..756e98e 100644 --- a/modules/nf-core/hisat2/extractsplicesites/meta.yml +++ b/modules/nf-core/hisat2/extractsplicesites/meta.yml @@ -8,7 +8,7 @@ keywords: tools: - hisat2: - description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. 
+ description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. homepage: https://daehwankimlab.github.io/hisat2/ documentation: https://daehwankimlab.github.io/hisat2/manual/ doi: "10.1038/s41587-019-0201-4" diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml index 954225d..adc7f53 100644 --- a/modules/nf-core/samtools/flagstat/meta.yml +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -2,7 +2,7 @@ name: samtools_flagstat description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type keywords: - stats - - mapping + - mapping - counts - bam - sam diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml index dda87e1..a258c1b 100644 --- a/modules/nf-core/samtools/idxstats/meta.yml +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -2,7 +2,7 @@ name: samtools_idxstats description: Reports alignment summary statistics for a BAM/CRAM/SAM file keywords: - stats - - mapping + - mapping - counts - chromosome - bam diff --git a/nextflow_schema.json b/nextflow_schema.json index 2de8d4e..4cec090 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -16,14 +16,32 @@ ], "properties": { "input": { + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "A design file with information about the samples in your experiment. Use this parameter to specify the location of the input files. It has to be a comma-separated file with a header row. See [usage docs](https://nf-co.re/rnadnavar/usage#input).\n\nIf no input file is specified, rnadnavar will attempt to locate one in the `{outdir}` directory. If no input should be supplied, i.e. 
when --step is supplied or --build_from_index, then set --input false", + "fa_icon": "fas fa-file-csv", + "schema": "assets/schema_input.json", + "anyOf": [ + { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$" + }, + { + "type": "boolean", + "enum": ["false"] + } + ] + }, + "input_restart": { "type": "string", "format": "file-path", "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/rnadnavar/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "hidden": true, + "schema": "assets/schema_input.json" }, "split_fastq": { "type": "integer", @@ -110,6 +128,13 @@ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nIf you wish to recompute indices available on igenomes, set `--bwamem2 false`.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner bwa-mem2` is specified. Combine with `--save_reference` to save for future runs.", "hidden": true }, + "dragmap": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to dragmap indices.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nIf you wish to recompute indices available on igenomes, set `--dragmap false`.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner dragmap` is specified. 
Combine with `--save_reference` to save for future runs.", + "hidden": true + }, "star_index": { "type": "string", "description": "Path to STAR index folder or compressed file (tar.gz)", @@ -424,7 +449,7 @@ "fa_icon": "fas fa-forward", "description": "Disable specified tools.", "help_text": "Multiple tools can be specified, separated by commas.\n\n> **NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevent the collection of duplicate metrics that slows down performance.", - "pattern": "^((contamination|learnreadorientation|baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions|splitncigar)*(,)*)*$" + "pattern": "^((contamination|learnreadorientation|baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions|splitncigar|second_pass)*(,)*)*$" }, "wes": { "type": "boolean", @@ -442,9 +467,12 @@ "properties": { "aligner": { "type": "string", - "default": "star", - "description": "Specifies the alignment algorithm to use. Currently available option is 'star'", - "help_text": "This parameter define which aligner is to be used for aligning the RNA reads to the reference genome. Currently only STAR aligner is supported. So use 'star' as the value for this option." + "default": "bwa-mem", + "fa_icon": "fas fa-puzzle-piece", + "enum": ["bwa-mem", "bwa-mem2", "dragmap"], + "description": "Specify aligner to be used to map reads to reference genome.", + "help_text": "`Rnadnavar` will build missing indices automatically if not provided. Set `--bwa false` if indices should be (re-)built.\nIf `DragMap` is selected as aligner, it is recommended to skip baserecalibration with `--skip_tools baserecalibrator`. 
//
// DNA and RNA ALIGNMENT
//
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

// SUBWORKFLOWS
// Convert BAM files to FASTQ files
include { BAM_CONVERT_SAMTOOLS as CONVERT_FASTQ_INPUT } from '../bam_convert_samtools/main'
// Map input reads to reference genome in DNA
include { FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP             } from '../fastq_align_bwamem_mem2_dragmap/main'
// Map input reads to reference genome in RNA
include { FASTQ_ALIGN_STAR                            } from '../../nf-core/fastq_align_star/main'
// Merge and index BAM files (optional)
include { BAM_MERGE_INDEX_SAMTOOLS                    } from '../bam_merge_index_samtools/main'
// Create samplesheets to restart from mapping
include { CHANNEL_ALIGN_CREATE_CSV                    } from '../channel_align_create_csv/main'

// MODULES
// Run FASTQC
include { FASTQC                                      } from '../../../modules/nf-core/fastqc/main'
// TRIM/SPLIT FASTQ Files
include { FASTP                                       } from '../../../modules/nf-core/fastp/main'
// BAM -> CRAM conversion for saved mapped output
// (fixed: BAM_TO_CRAM_MAPPING was invoked below but never included)
include { SAMTOOLS_CONVERT as BAM_TO_CRAM_MAPPING     } from '../../../modules/nf-core/samtools/convert/main'


workflow BAM_ALIGN {

    take:
    bwa          // channel: [optional]  bwa-mem index, used when params.aligner == "bwa-mem"
    bwamem2      // channel: [optional]  bwa-mem2 index, used when params.aligner == "bwa-mem2"
    dragmap      // channel: [optional]  dragmap hashtable, used when params.aligner == "dragmap"
    star_index   // channel: [mandatory for RNA] STAR index
    gtf          // channel: [mandatory for RNA] gtf annotation
    fasta        // channel: [mandatory] reference fasta, needed for BAM -> CRAM conversion
    fasta_fai    // channel: [mandatory] reference fasta index, needed for BAM -> CRAM conversion
    input_sample // channel: parsed samplesheet entries [ meta, files ]

    main:
    reports  = Channel.empty()
    versions = Channel.empty()

    // Initialised up-front so the emit: block is always defined,
    // even when params.step != 'mapping' (previously undefined in that case)
    bam_mapped     = Channel.empty()
    bam_mapped_dna = Channel.empty()
    bam_mapped_rna = Channel.empty()

    // Gather index for mapping given the chosen aligner for DNA
    index_alignment = params.aligner == "bwa-mem"  ? bwa     :
                      params.aligner == "bwa-mem2" ? bwamem2 :
                      dragmap

    if (params.step == 'mapping') {

        // Figure out if input is bam or fastq
        input_sample_type = input_sample.branch{
            bam:   it[0].data_type == "bam"
            fastq: it[0].data_type == "fastq"
        }

        // STEP 1.A: convert any bam input to fastq
        // fasta is not needed when converting bam to fastq -> [ id:"fasta" ], []
        // No need for fasta.fai -> []
        interleave_input = false // Currently don't allow interleaved input
        CONVERT_FASTQ_INPUT(
            input_sample_type.bam,
            [ [ id:"fasta" ], [] ], // fasta
            [ [ id:'null' ], [] ],  // fasta_fai
            interleave_input)

        // Gather fastq (inputted or converted)
        // Theoretically this could work on mixed input (fastq for one sample and bam for another)
        input_fastq = input_sample_type.fastq.mix(CONVERT_FASTQ_INPUT.out.reads)

        // STEP 1.B: QC
        if (!(params.skip_tools && params.skip_tools.split(',').contains('fastqc'))) {
            FASTQC(input_fastq)

            reports  = reports.mix(FASTQC.out.zip.collect{ meta, logs -> logs })
            versions = versions.mix(FASTQC.out.versions.first())
        }

        // STEP 1.C: Trimming and/or splitting
        if (params.trim_fastq || params.split_fastq > 0) {

            save_trimmed_fail = false
            save_merged       = false
            FASTP(
                input_fastq,
                [], // we are not using any adapter fastas at the moment
                save_trimmed_fail,
                save_merged)

            reports = reports.mix(FASTP.out.json.collect{ meta, json -> json })
            reports = reports.mix(FASTP.out.html.collect{ meta, html -> html })

            if (params.split_fastq) {
                // Pair up split files and record the number of chunks in meta.size
                reads_for_alignment = FASTP.out.reads.map{ meta, reads ->
                    read_files = reads.sort(false) { a, b -> a.getName().tokenize('.')[0] <=> b.getName().tokenize('.')[0] }.collate(2)
                    [ meta + [ size:read_files.size() ], read_files ]
                }.transpose()
            } else {
                reads_for_alignment = FASTP.out.reads
            }

            versions = versions.mix(FASTP.out.versions)

        } else {
            reads_for_alignment = input_fastq
        }

        // STEP 1.D: MAPPING READS TO REFERENCE GENOME
        // Update meta.id to meta.sample when there are no multiple lanes or split fastqs
        reads_for_alignment = reads_for_alignment.map{ meta, reads ->
            if (meta.size * meta.num_lanes == 1) [ meta + [ id:meta.sample ], reads ]
            else [ meta, reads ]
        }

        // Separate DNA from RNA samples: DNA aligned with bwa/bwa-mem2/dragmap, RNA with STAR
        reads_for_alignment_status = reads_for_alignment.branch{
            dna: it[0].status < 2
            rna: it[0].status == 2
        }

        // STEP 1.D.1: DNA mapping
        sort_bam = true
        FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP(reads_for_alignment_status.dna, index_alignment, sort_bam)

        // Group the bams from the same samples so the workflow is not stalled.
        // groupKey lets a complete group advance as soon as all of its reads are mapped.
        // meta cleanup: read_group is now in the BAM header; num_lanes/size were only needed for mapping.
        bam_mapped_dna = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP.out.bam.map{ meta, bam ->
            [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ]
        }.groupTuple()
        bam_mapped_dna.dump(tag:"bam_mapped_dna") // fixed: was `bam_mapped_dna,dump(...)` (comma typo — syntax error)

        // STEP 1.D.2: RNA mapping with STAR
        // fixed: was called as ALIGN_STAR (not included) with undefined ch_reads_to_map_status
        FASTQ_ALIGN_STAR(
            reads_for_alignment_status.rna,
            star_index,
            gtf,
            params.star_ignore_sjdbgtf,
            params.seq_platform ?: [],
            params.seq_center ?: [],
            [ [ id:"fasta" ], [] ] // fasta
        )

        // Same sample-level grouping as for DNA (see comment above)
        bam_mapped_rna = FASTQ_ALIGN_STAR.out.bam.map{ meta, bam ->
            [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ]
        }.groupTuple()
        bam_mapped_rna.dump(tag:"bam_mapped_rna") // fixed: was `bam_mapped_rna,dump(...)` (comma typo — syntax error)

        // Gather QC reports
        reports = reports.mix(FASTQ_ALIGN_STAR.out.stats.collect{ it[1] }.ifEmpty([]))
        reports = reports.mix(FASTQ_ALIGN_STAR.out.log_final.collect{ it[1] }.ifEmpty([]))

        // Mix DNA and RNA in one channel
        bam_mapped = bam_mapped_dna.mix(bam_mapped_rna)

        // gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here,
        // except if and only if skipping markduplicates or saving mapped bams
        if (params.save_mapped || (params.skip_tools && params.skip_tools.split(',').contains('markduplicates'))) {

            // bams are merged (when multiple lanes from the same sample), indexed, then converted to cram
            BAM_MERGE_INDEX_SAMTOOLS(bam_mapped)
            BAM_TO_CRAM_MAPPING(BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai, fasta, fasta_fai)

            // Create CSV to restart from this step
            params.save_output_as_bam ? CHANNEL_ALIGN_CREATE_CSV(BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai) : CHANNEL_ALIGN_CREATE_CSV(BAM_TO_CRAM_MAPPING.out.alignment_index)

            versions = versions.mix(BAM_MERGE_INDEX_SAMTOOLS.out.versions)
            versions = versions.mix(BAM_TO_CRAM_MAPPING.out.versions)
        }

        // Gather used software versions (fixed: STAR versions were previously mixed in twice)
        versions = versions.mix(CONVERT_FASTQ_INPUT.out.versions)
        versions = versions.mix(FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP.out.versions)
        versions = versions.mix(FASTQ_ALIGN_STAR.out.versions)
    }

    emit:
    bam_mapped_rna = bam_mapped_rna // RNA bams, grouped per sample (second pass carries RG tags)
    bam_mapped_dna = bam_mapped_dna // DNA bams, grouped per sample
    bam_mapped     = bam_mapped     // mixed DNA+RNA, for preprocessing
    reports        = reports
    versions       = versions
}
'../../modules/nf-core/modules/samtools/collatefastq/main' -include { SAMTOOLS_COLLATEFASTQ as COLLATE_FASTQ_MAP } from '../../modules/nf-core/modules/samtools/collatefastq/main' -include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_MAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_UNMAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_MAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_UNMAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_UNMAP } from '../../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_COLLATEFASTQ as COLLATE_FASTQ_UNMAP } from '../../../modules/nf-core/samtools/collatefastq/main' +include { SAMTOOLS_COLLATEFASTQ as COLLATE_FASTQ_MAP } from '../../../modules/nf-core/samtools/collatefastq/main' +include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' workflow BAM_CONVERT_SAMTOOLS { take: diff --git a/subworkflows/local/bam_merge_index_samtools/main.nf b/subworkflows/local/bam_merge_index_samtools/main.nf index 6aa5d93..9e8735f 100644 --- a/subworkflows/local/bam_merge_index_samtools/main.nf +++ b/subworkflows/local/bam_merge_index_samtools/main.nf @@ -4,8 +4,8 @@ // For all modules here: // A when clause condition is defined in the conf/modules.config to determine if the module should be run -include { SAMTOOLS_INDEX as INDEX_MERGE_BAM } from '../../../modules/nf-core/modules/samtools/index/main' -include { SAMTOOLS_MERGE as MERGE_BAM } from '../../../modules/nf-core/modules/samtools/merge/main' +include { SAMTOOLS_INDEX as INDEX_MERGE_BAM } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_MERGE as MERGE_BAM } from '../../../modules/nf-core/samtools/merge/main' workflow BAM_MERGE_INDEX_SAMTOOLS { take: @@ -15,14 +15,12 @@ 
//
// CHANNEL_ALIGN_CREATE_CSV
//
// Writes a samplesheet (mapped.csv) so the pipeline can be restarted from the mapping step.

workflow CHANNEL_ALIGN_CREATE_CSV {
    take:
    bam_indexed // channel: [mandatory] meta, bam, bai

    main:
    // Creating csv files to restart from this step
    bam_indexed.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/csv") { meta, bam, bai ->
        patient = meta.patient
        sample  = meta.sample
        status  = meta.status
        bam = "${params.outdir}/preprocessing/mapped/${sample}/${bam.name}"
        bai = "${params.outdir}/preprocessing/mapped/${sample}/${bai.name}"

        type       = params.save_output_as_bam ? "bam" : "cram"
        type_index = params.save_output_as_bam ? "bai" : "crai"

        // fixed: the header previously declared a `sex` column (6 columns) that the data
        // row (5 columns) never provided, producing a misaligned mapped.csv
        ["mapped.csv", "patient,status,sample,${type},${type_index}\n${patient},${status},${sample},${bam},${bai}\n"]
    }
}

//
// FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP (DNA mapping)
//
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { BWAMEM2_MEM            } from '../../../modules/nf-core/bwamem2/mem/main'
include { BWA_MEM as BWAMEM1_MEM } from '../../../modules/nf-core/bwa/mem/main'
include { DRAGMAP_ALIGN          } from '../../../modules/nf-core/dragmap/align/main'

workflow FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP {
    take:
    reads // channel: [mandatory] meta, reads
    index // channel: [mandatory] index for the chosen aligner
    sort  // boolean: [mandatory] true -> sort, false -> don't sort

    main:
    versions = Channel.empty()
    reports  = Channel.empty()

    // Only one of the following is actually executed:
    // the per-aligner when-clauses live in conf/modules.config
    BWAMEM1_MEM  (reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is bwa-mem
    BWAMEM2_MEM  (reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is bwa-mem2
    DRAGMAP_ALIGN(reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is dragmap

    // Get the bam files from whichever aligner ran
    bam = Channel.empty()
        .mix(BWAMEM1_MEM.out.bam)
        .mix(BWAMEM2_MEM.out.bam)
        .mix(DRAGMAP_ALIGN.out.bam)

    // Gather reports of all tools used (only dragmap emits a log)
    reports = reports.mix(DRAGMAP_ALIGN.out.log)

    // Gather versions of all tools used
    versions = versions.mix(BWAMEM1_MEM.out.versions)
    versions = versions.mix(BWAMEM2_MEM.out.versions)
    versions = versions.mix(DRAGMAP_ALIGN.out.versions)

    emit:
    bam      // channel: [ [meta], bam ]
    reports
    versions // channel: [ versions.yml ]
}
bwamem2 : dragmap - - if (params.step in ['mapping']) { - // Separate input in bam or fastq - ch_input_sample.branch{ - bam: it[0].data_type == "bam" - fastq: it[0].data_type == "fastq" - }.set{ch_input_sample_type} - - // STEP 1.A: convert any bam input to fastq - CONVERT_FASTQ_INPUT(ch_input_sample_type.bam, - [ [ id:"fasta" ], [] ], // fasta - [ [ id:'null' ], [] ], // fasta_fai - false - ) - ch_versions = ch_versions.mix(CONVERT_FASTQ_INPUT.out.versions) - // gather fastq (from input or converted with BAM_TO_FASTQ) - // Theorically this could work on mixed input (fastq for one sample and bam for another) - ch_input_fastq = ch_input_sample_type.fastq.mix(CONVERT_FASTQ_INPUT.out.reads) - - - // STEP 1.B: QC - if (!(params.skip_tools && params.skip_tools.split(',').contains('fastqc'))) { - RUN_FASTQC(ch_input_fastq) - ch_reports = ch_reports.mix(RUN_FASTQC.out.fastqc_zip.collect{meta, logs -> logs}) - ch_versions = ch_versions.mix(RUN_FASTQC.out.versions) - } - - - // STEP 1.C: Trimming and/or splitting - if (params.trim_fastq || params.split_fastq > 0) { - // Call FASTP for trimming - FASTP(ch_input_fastq, params.save_trimmed_fail, params.save_merged_fastq) - ch_reports = ch_reports.mix( - FASTP.out.json.collect{meta, json -> json}, - FASTP.out.html.collect{meta, html -> html} - ) - // Map channel by split group - if(params.split_fastq){ - ch_reads_to_map = FASTP.out.reads.map{ key, reads -> - read_files = reads.collate(2) // removed sorting because gace concurrent error - it looks like files are sorted already - [[ - data_type:key.data_type, - id:key.id, - numLanes:key.numLanes, - patient: key.patient, - read_group:key.read_group, - sample:key.sample, - size:read_files.size(), - status:key.status, - ], - read_files] - }.transpose() - } else { - ch_reads_to_map = FASTP.out.reads - } - - ch_versions = ch_versions.mix(FASTP.out.versions) - } else { - ch_reads_to_map = ch_input_fastq - } - - - // STEP 1.D: MAPPING READS TO REFERENCE GENOME - // Generate mapped reads 
channel for alignment - ch_reads_to_map = ch_reads_to_map.map{ meta, reads -> - // update ID when no multiple lanes or splitted fastqs - new_id = meta.size * meta.numLanes == 1 ? meta.sample : meta.id - - [[ - data_type: meta.data_type, - id: new_id, - numLanes: meta.numLanes, - patient: meta.patient, - read_group: meta.read_group, - sample: meta.sample, - size: meta.size, - status: meta.status, - ], - reads] - } - // Separate DNA from RNA samples, DNA samples will be aligned with bwa, and RNA samples with star - ch_reads_to_map.branch{ - dna: it[0].status < 2 - rna: it[0].status == 2 - }.set{ch_reads_to_map_status} - - // STEP 1.D.1: DNA mapping with BWA - sort_bam = true // TODO: set up as parameter - GATK4_MAPPING(ch_reads_to_map_status.dna, ch_map_index, sort_bam) - ch_versions = ch_versions.mix(GATK4_MAPPING.out.versions) - // Grouping the bams from the same samples - bwa_bams = GATK4_MAPPING.out.bam - ch_bam_mapped_dna = GATK4_MAPPING.out.bam.map{ meta, bam -> - [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ] - - }.groupTuple() - - - // RNA will be aligned with STAR - // ch_reads_to_map_rna = ch_reads_to_map_status.rna.map{ meta, reads -> [meta, reads] } - // STAR - ALIGN_STAR ( - ch_reads_to_map_status.rna, - star_index, - gtf, - params.star_ignore_sjdbgtf, - params.seq_platform ? params.seq_platform : [], - params.seq_center ? 
params.seq_center : [], - [ [ id:"fasta" ], [] ] // fasta - ) - // Grouping the bams from the same samples not to stall the workflow - star_bams = ALIGN_STAR.out.bam.groupTuple(sort: true) - ch_bam_mapped_rna = ALIGN_STAR.out.bam.map{ meta, bam -> - [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ] - }.groupTuple() - // Gather QC reports - ch_reports = ch_reports.mix(ALIGN_STAR.out.stats.collect{it[1]}.ifEmpty([])) - ch_reports = ch_reports.mix(ALIGN_STAR.out.log_final.collect{it[1]}.ifEmpty([])) - ch_versions = ch_versions.mix(ALIGN_STAR.out.versions) - - // mix dna and rna in one channel - ch_bam_mapped = ch_bam_mapped_dna.mix(ch_bam_mapped_rna) - // Grouping the bams from the same samples not to stall the workflow - bam_mapped = ch_bam_mapped.map{ meta, bam -> - - // Update meta.id to be meta.sample, ditching sample-lane that is not needed anymore - // Update meta.data_type - // Remove no longer necessary fields: - // read_group: Now in the BAM header - // num_lanes: only needed for mapping - // size: only needed for mapping - - // Use groupKey to make sure that the correct group can advance as soon as it is complete - // and not stall the workflow until all reads from all channels are mapped - [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), - bam ] - }.groupTuple().map{meta, bam -> [meta, bam.flatten()]} - // gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here - // Except if and only if skipping markduplicates or saving mapped bams - if (params.save_bam_mapped || (params.skip_tools && params.skip_tools.split(',').contains('markduplicates'))) { - // bams are merged (when multiple lanes from the same sample), indexed and then converted to cram - bam_mapped.dump(tag:"bam_mapped") - BAM_MERGE_INDEX_SAMTOOLS(bam_mapped) - // Create 
CSV to restart from this step - MAPPING_CSV(BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai.transpose()) - - // Gather used softwares versions - ch_versions = ch_versions.mix(BAM_MERGE_INDEX_SAMTOOLS.out.versions) - } - } - else { - ch_input_sample.branch{ - bam: it[0].data_type == "bam" - fastq: it[0].data_type == "fastq" - }.set{ch_input_sample_type} - ch_bam_mapped = ch_input_sample_type.bam - - ch_bam_mapped.branch{ - rna: it[0].status >= 2 - dna: it[0].status < 2 - }.set{ch_input_sample_class} - star_bams = ch_input_sample_class.rna - bwa_bams = ch_input_sample_class.dna - } - - emit: - star_bams = star_bams //second pass with RG tags - bwa_bams = bwa_bams // second pass with RG tags - ch_bam_mapped = bam_mapped // for preprocessing - reports = ch_reports - versions = ch_versions - -} \ No newline at end of file diff --git a/subworkflows/local/mapping_csv.nf b/subworkflows/local/mapping_csv.nf deleted file mode 100644 index a9d8c32..0000000 --- a/subworkflows/local/mapping_csv.nf +++ /dev/null @@ -1,21 +0,0 @@ -// -// MAPPING_CSV -// - -workflow MAPPING_CSV { - take: - bam_indexed // channel: [mandatory] meta, bam, bai - - main: - // Creating csv files to restart from this step - bam_indexed.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/csv") { meta, bam, bai -> - id = meta.id - patient = meta.patient - sample = meta.sample - status = meta.status - lane = meta.lane - bam = "${params.outdir}/preprocessing/mapped/${patient}/${id}/${bam.name}" - bai = "${params.outdir}/preprocessing/mapped/${patient}/${id}/${bai.name}" - ["mapped.csv", "patient,status,sample,lane,fastq_1,fastq_2,bam,bai,cram,crai,table,vcf\n${patient},${status},${sample},${lane},,,${bam},${bai},,,,\n"] - } -} diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf deleted file mode 100644 index 43d9a58..0000000 --- a/subworkflows/local/prepare_genome.nf +++ /dev/null @@ -1,189 +0,0 @@ -// -// PREPARE GENOME -// - -// Initialize channels based 
on params or indices that were just built -// For all modules here: -// A when clause condition is defined in the conf/modules.config to determine if the module should be run -// Condition is based on params.step and params.tools -// If and extra condition exists, it's specified in comments - -include { BWA_INDEX as BWAMEM1_INDEX } from '../../modules/nf-core/modules/bwa/index/main' -include { BWAMEM2_INDEX } from '../../modules/nf-core/modules/bwamem2/index/main' -include { DRAGMAP_HASHTABLE } from '../../modules/nf-core/modules/dragmap/hashtable/main' -include { GTF2BED } from '../../modules/local/gtf2bed' //addParams(options: params.genome_options) -include { GUNZIP as GUNZIP_GENE_BED } from '../../modules/nf-core/modules/gunzip/main' //addParams(options: params.genome_options) -include { STAR_GENOMEGENERATE } from '../../modules/nf-core/modules/star/genomegenerate/main' //addParams(options: params.star_index_options) -include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main' -include { SAMTOOLS_FAIDX } from '../../modules/nf-core/modules/samtools/faidx/main' -include { TABIX_TABIX as TABIX_DBSNP } from '../../modules/nf-core/modules/tabix/tabix/main' -include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../modules/nf-core/modules/tabix/tabix/main' -include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../modules/nf-core/modules/tabix/tabix/main' -include { TABIX_TABIX as TABIX_KNOWN_SNPS } from '../../modules/nf-core/modules/tabix/tabix/main' -include { TABIX_TABIX as TABIX_PON } from '../../modules/nf-core/modules/tabix/tabix/main' -include { UNZIP as UNZIP_ALLELES } from '../../modules/nf-core/modules/unzip/main' -include { UNZIP as UNZIP_LOCI } from '../../modules/nf-core/modules/unzip/main' -include { UNZIP as UNZIP_GC } from '../../modules/nf-core/modules/unzip/main' -include { UNZIP as UNZIP_RT } from '../../modules/nf-core/modules/unzip/main' -include { HISAT2_EXTRACTSPLICESITES } from 
'../../modules/nf-core/modules/hisat2/extractsplicesites/main' -include { HISAT2_BUILD } from '../../modules/nf-core/modules/hisat2/build/main' - -workflow PREPARE_GENOME { - take: - dbsnp // channel: [optional] dbsnp - fasta // channel: [mandatory] fasta - fasta_fai // channel: [optional] fasta_fai - germline_resource // channel: [optional] germline_resource - known_indels // channel: [optional] known_indels - known_snps - pon // channel: [optional] pon - - - main: - - ch_versions = Channel.empty() - - - // If aligner is bwa-mem - BWAMEM1_INDEX(fasta) // If aligner is bwa-mem - BWAMEM2_INDEX(fasta) // If aligner is bwa-mem2 - DRAGMAP_HASHTABLE(fasta) // If aligner is dragmap - - GATK4_CREATESEQUENCEDICTIONARY(fasta) - SAMTOOLS_FAIDX(fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, [['id':null], []]) - - // - // Uncompress GTF annotation file or create from GFF3 if required - // - ch_gffread_version = Channel.empty() - if (params.gtf) { - if (params.gtf.endsWith('.gz')) { - GUNZIP_GTF ( - Channel.fromPath(params.gtf).map{ it -> [[id:it[0].baseName], it] } - ) - ch_gtf = GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) - } else { - ch_gtf = Channel.fromPath(params.gtf).collect() - } - } else if (params.gff) { - if (params.gff.endsWith('.gz')) { - GUNZIP_GFF ( - Channel.fromPath(params.gff).map{ it -> [[id:it[0].baseName], it] } - ) - ch_gff = GUNZIP_GFF.out.gunzip.map{ meta, gff -> [gff] }.collect() - ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) - } else { - ch_gff = Channel.fromPath(params.gff).collect() - } - - GFFREAD ( - ch_gff - ) - .gtf - .set { ch_gtf } - - ch_versions = ch_versions.mix(GFFREAD.out.versions) - } - - // - // Uncompress exon BED annotation file or create from GTF if required - // - if (params.exon_bed) { - if (params.exon_bed.endsWith('.gz')) { - GUNZIP_GENE_BED ( - Channel.fromPath(params.exon_bed).map{ it -> [[id:it[0].baseName], it] } - ) - ch_gene_bed = 
GUNZIP_GENE_BED.out.gunzip.map{ meta, bed -> [bed] }.collect() - ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) - } else { - ch_exon_bed = Channel.fromPath(params.exon_bed).collect() - } - } else { - ch_exon_bed = GTF2BED ( ch_gtf ).bed.collect() - ch_versions = ch_versions.mix(GTF2BED.out.versions) - } - - // - // Uncompress STAR index or generate from scratch if required - // - ch_star_index = Channel.empty() - if (params.star_index) { - if (params.star_index.endsWith('.tar.gz')) { - UNTAR_STAR_INDEX ( - Channel.fromPath(params.star_index).map{ it -> [[id:it[0].baseName], it] } - ) - ch_star_index = UNTAR_STAR_INDEX.out.untar.map{ meta, star_index -> [star_index] }.collect() - ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) - } else { - ch_star_index = Channel.fromPath(params.star_index).collect() - } - } - else { - STAR_GENOMEGENERATE ( - fasta,ch_gtf - ) - .index - .set { ch_star_index } - ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) - } - // HISAT mandatory for realignment TODO: make opt with arguments - if (params.splicesites) { - ch_splicesites = Channel.fromPath(params.splicesites).collect() - } else{ - ch_splicesites = HISAT2_EXTRACTSPLICESITES ( ch_gtf ).txt - ch_versions = ch_versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) - - } - - if (params.hisat2_index) { - ch_hisat2_index = Channel.fromPath(params.hisat2_index).collect() - } else{ - ch_hisat2_index = HISAT2_BUILD ( fasta, ch_gtf, ch_splicesites ).index - ch_versions = ch_versions.mix(HISAT2_BUILD.out.versions) - } - - - - - // the following are flattened and mapped in case the user supplies more than one value for the param - // [file1,file2] becomes [[meta1,file1],[meta2,file2]] - // outputs are collected to maintain a single channel for relevant TBI files - TABIX_DBSNP(dbsnp.flatten().map{ it -> [[id:it.baseName], it] }) - TABIX_GERMLINE_RESOURCE(germline_resource.flatten().map{ it -> [[id:it.baseName], it] }) - TABIX_KNOWN_SNPS( 
known_snps.flatten().map{ it -> [[id:it.baseName], it] } ) - TABIX_KNOWN_INDELS( known_indels.flatten().map{ it -> [[id:it.baseName], it] } ) - TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] }) - - - - // Gather versions of all tools used - ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) - ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions) - ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions) - ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) - ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions) - ch_versions = ch_versions.mix(TABIX_GERMLINE_RESOURCE.out.versions) - ch_versions = ch_versions.mix(TABIX_KNOWN_SNPS.out.versions) - ch_versions = ch_versions.mix(TABIX_KNOWN_INDELS.out.versions) - ch_versions = ch_versions.mix(TABIX_PON.out.versions) - - - emit: - bwa = BWAMEM1_INDEX.out.index // path: bwa/* - bwamem2 = BWAMEM2_INDEX.out.index // path: bwamem2/* - hashtable = DRAGMAP_HASHTABLE.out.hashmap // path: dragmap/* - dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnb.vcf.gz.tbi - dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict - fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai - star_index = ch_star_index // path: star/index/ - gtf = ch_gtf // path: genome.gtf - exon_bed = ch_exon_bed // path: exon.bed - germline_resource_tbi = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: germline_resource.vcf.gz.tbi - known_snps_tbi = TABIX_KNOWN_SNPS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi - known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi - pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi - hisat2_index = ch_hisat2_index // path: hisat2/index/ - splicesites = ch_splicesites // path: genome.splicesites.txt - versions = ch_versions // channel: 
[ versions.yml ] -} diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf new file mode 100644 index 0000000..1e8f21b --- /dev/null +++ b/subworkflows/local/prepare_genome/main.nf @@ -0,0 +1,189 @@ +// +// PREPARE GENOME +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run +// Condition is based on params.step and params.tools +// If and extra condition exists, it's specified in comments + +include { BWA_INDEX as BWAMEM1_INDEX } from '../../../modules/nf-core/bwa/index/main' +include { BWAMEM2_INDEX } from '../../../modules/nf-core/bwamem2/index/main' +include { DRAGMAP_HASHTABLE } from '../../../modules/nf-core/dragmap/hashtable/main' +include { GTF2BED } from '../../../modules/local/gtf2bed' //addParams(options: params.genome_options) +include { GUNZIP as GUNZIP_GENE_BED } from '../../../modules/nf-core/gunzip/main' //addParams(options: params.genome_options) +include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate/main' //addParams(options: params.star_index_options) +include { GATK4_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/gatk4/createsequencedictionary/main' +include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main' +include { TABIX_TABIX as TABIX_DBSNP } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_KNOWN_SNPS } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_PON } from '../../../modules/nf-core/tabix/tabix/main' +include { HISAT2_EXTRACTSPLICESITES } from '../../../modules/nf-core/hisat2/extractsplicesites/main' +include { HISAT2_BUILD 
} from '../../../modules/nf-core/hisat2/build/main' + +workflow PREPARE_GENOME { + take: + dbsnp // channel: [optional] dbsnp + fasta // channel: [mandatory] fasta + fasta_fai // channel: [optional] fasta_fai + germline_resource // channel: [optional] germline_resource + known_indels // channel: [optional] known_indels + known_snps // channel: [optional] known_snps + pon // channel: [optional] pon + + + main: + + fasta = fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] } + versions = Channel.empty() + + // If aligner is bwa-mem + BWAMEM1_INDEX(fasta) // If aligner is bwa-mem + BWAMEM2_INDEX(fasta) // If aligner is bwa-mem2 + DRAGMAP_HASHTABLE(fasta) // If aligner is dragmap + + + GATK4_CREATESEQUENCEDICTIONARY(fasta) + SAMTOOLS_FAIDX(fasta, [['id':null], []]) + + + // the following are flattened and mapped in case the user supplies more than one value for the param + // written for KNOWN_INDELS, but preemptively applied to the rest + // [ file1, file2 ] becomes [ [ meta1, file1 ], [ meta2, file2 ] ] + // outputs are collected to maintain a single channel for relevant TBI files + TABIX_DBSNP(dbsnp.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + TABIX_GERMLINE_RESOURCE(germline_resource.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + TABIX_KNOWN_SNPS(known_snps.flatten().map{ it -> [ [ id:it.baseName ], it ] } ) + TABIX_KNOWN_INDELS(known_indels.flatten().map{ it -> [ [ id:it.baseName ], it ] } ) + TABIX_PON(pon.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + + // + // Uncompress GTF annotation file or create from GFF3 if required + // + ch_gffread_version = Channel.empty() + if (params.gtf) { + if (params.gtf.endsWith('.gz')) { + GUNZIP_GTF ( + Channel.fromPath(params.gtf).map{ it -> [[id:it[0].baseName], it] } + ) + ch_gtf = GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() + versions = versions.mix(GUNZIP_GTF.out.versions) + } else { + ch_gtf = Channel.fromPath(params.gtf).collect() + } + } else if (params.gff) { + if 
(params.gff.endsWith('.gz')) { + GUNZIP_GFF ( + Channel.fromPath(params.gff).map{ it -> [[id:it[0].baseName], it] } + ) + ch_gff = GUNZIP_GFF.out.gunzip.map{ meta, gff -> [gff] }.collect() + versions = versions.mix(GUNZIP_GFF.out.versions) + } else { + ch_gff = Channel.fromPath(params.gff).collect() + } + + GFFREAD ( + ch_gff + ) + .gtf + .set { ch_gtf } + + versions = versions.mix(GFFREAD.out.versions) + } + + // + // Uncompress exon BED annotation file or create from GTF if required + // + if (params.exon_bed) { + if (params.exon_bed.endsWith('.gz')) { + GUNZIP_GENE_BED ( + Channel.fromPath(params.exon_bed).map{ it -> [[id:it[0].baseName], it] } + ) + ch_gene_bed = GUNZIP_GENE_BED.out.gunzip.map{ meta, bed -> [bed] }.collect() + versions = versions.mix(GUNZIP_GENE_BED.out.versions) + } else { + ch_gene_bed = Channel.fromPath(params.exon_bed).collect() + } + } else { + ch_exon_bed = GTF2BED ( ch_gtf ).bed.collect() + versions = versions.mix(GTF2BED.out.versions) + } + + // + // Uncompress STAR index or generate from scratch if required + // + ch_star_index = Channel.empty() + if (params.star_index) { + if (params.star_index.endsWith('.tar.gz')) { + UNTAR_STAR_INDEX ( + Channel.fromPath(params.star_index).map{ it -> [[id:it[0].baseName], it] } + ) + ch_star_index = UNTAR_STAR_INDEX.out.untar.map{ meta, star_index -> [star_index] }.collect() + versions = versions.mix(UNTAR_STAR_INDEX.out.versions) + } else { + ch_star_index = Channel.fromPath(params.star_index).collect() + } + } + else { + STAR_GENOMEGENERATE ( fasta.map{meta, fasta -> fasta},ch_gtf ) + ch_star_index = STAR_GENOMEGENERATE.out.index + versions = versions.mix(STAR_GENOMEGENERATE.out.versions) + } + + + // HISAT2 not necessary if second pass skipped + if (!params.skip_tools.split(',').contains("second_pass")){ + if (params.splicesites) { + ch_splicesites = Channel.fromPath(params.splicesites).collect() + } else{ + ch_splicesites = HISAT2_EXTRACTSPLICESITES ( ch_gtf ).txt + versions = 
versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) + + } + + if (params.hisat2_index) { + ch_hisat2_index = Channel.fromPath(params.hisat2_index).collect() + } else{ + ch_hisat2_index = HISAT2_BUILD ( fasta, ch_gtf, ch_splicesites ).index + versions = versions.mix(HISAT2_BUILD.out.versions) + } + } else { + ch_hisat2_index = Channel.empty() + ch_splicesites = Channel.empty() + } + + + // Gather versions of all tools used + versions = versions.mix(SAMTOOLS_FAIDX.out.versions) + versions = versions.mix(BWAMEM1_INDEX.out.versions) + versions = versions.mix(BWAMEM2_INDEX.out.versions) + versions = versions.mix(DRAGMAP_HASHTABLE.out.versions) + versions = versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) + versions = versions.mix(TABIX_DBSNP.out.versions) + versions = versions.mix(TABIX_GERMLINE_RESOURCE.out.versions) + versions = versions.mix(TABIX_KNOWN_SNPS.out.versions) + versions = versions.mix(TABIX_KNOWN_INDELS.out.versions) + versions = versions.mix(TABIX_PON.out.versions) + + + emit: + bwa = BWAMEM1_INDEX.out.index.map{ meta, index -> [index] }.collect() // path: bwa/* + bwamem2 = BWAMEM2_INDEX.out.index.map{ meta, index -> [index] }.collect() // path: bwamem2/* + hashtable = DRAGMAP_HASHTABLE.out.hashmap.map{ meta, index -> [index] }.collect() // path: dragmap/* + dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnp.vcf.gz.tbi + dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict + fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai + germline_resource_tbi = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: germline_resource.vcf.gz.tbi + known_snps_tbi = TABIX_KNOWN_SNPS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_snps*}.vcf.gz.tbi + known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi + pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // 
path: pon.vcf.gz.tbi + star_index = ch_star_index // path: star/index/ + gtf = ch_gtf // path: genome.gtf + exon_bed = ch_exon_bed // path: exon.bed + hisat2_index = ch_hisat2_index // path: hisat2/index/ + splicesites = ch_splicesites // path: genome.splicesites.txt + versions = versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/prepare_intervals.nf b/subworkflows/local/prepare_intervals/main.nf similarity index 90% rename from subworkflows/local/prepare_intervals.nf rename to subworkflows/local/prepare_intervals/main.nf index 2a84942..2d5e42b 100644 --- a/subworkflows/local/prepare_intervals.nf +++ b/subworkflows/local/prepare_intervals/main.nf @@ -1,11 +1,11 @@ // // PREPARE INTERVALS // -include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main' -include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main' -include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main' -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main' -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_COMBINED } from '../../modules/nf-core/modules/tabix/bgziptabix/main' +include { BUILD_INTERVALS } from '../../../modules/local/build_intervals/main' +include { CREATE_INTERVALS_BED } from '../../../modules/local/create_intervals_bed/main' +include { GATK4_INTERVALLISTTOBED } from '../../../modules/nf-core/gatk4/intervallisttobed/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_COMBINED } from '../../../modules/nf-core/tabix/bgziptabix/main' workflow PREPARE_INTERVALS { take: @@ -19,6 +19,7 @@ workflow PREPARE_INTERVALS { intervals_bed = Channel.empty() // List of [ bed, num_intervals ], one for each region intervals_bed_gz_tbi = Channel.empty() // List of [ bed.gz, bed.gz.tbi, 
num_intervals ], one for each region intervals_combined = Channel.empty() // Single bed file containing all intervals + if (no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" file("${params.outdir}/no_intervals.bed.gz").text = "no_intervals\n" @@ -43,7 +44,6 @@ workflow PREPARE_INTERVALS { } else { intervals_combined = Channel.fromPath(file(intervals)).map{it -> [ [ id:it.baseName ], it ] } intervals_bed = CREATE_INTERVALS_BED(file(intervals)).bed - intervals_bed.dump(tag:"intervals_bed") versions = versions.mix(CREATE_INTERVALS_BED.out.versions) diff --git a/subworkflows/local/prepare_reference_and_intervals.nf b/subworkflows/local/prepare_reference_and_intervals.nf index 09f63fe..9e4ec35 100644 --- a/subworkflows/local/prepare_reference_and_intervals.nf +++ b/subworkflows/local/prepare_reference_and_intervals.nf @@ -1,10 +1,10 @@ // // PREPARE REFERENCE AND INTERVAL FILES FOR PIPELINE // -include { PREPARE_GENOME } from './prepare_genome' -include { PREPARE_INTERVALS } from './prepare_intervals' -include { GATK4_BEDTOINTERVALLIST } from '../../modules/nf-core/modules/gatk4/bedtointervallist/main' -include { GATK4_INTERVALLISTTOOLS } from '../../modules/nf-core/modules/gatk4/intervallisttools/main' +include { PREPARE_GENOME } from './prepare_genome/main' +include { PREPARE_INTERVALS } from './prepare_intervals/main' +include { GATK4_BEDTOINTERVALLIST } from '../../modules/nf-core/gatk4/bedtointervallist/main' +include { GATK4_INTERVALLISTTOOLS } from '../../modules/nf-core/gatk4/intervallisttools/main' workflow PREPARE_REFERENCE_AND_INTERVALS { @@ -90,13 +90,13 @@ workflow PREPARE_REFERENCE_AND_INTERVALS { } emit: - fasta = fasta - fasta_fai = fasta_fai - dict = dict - bwa = bwa - germline_resource = germline_resource - germline_resource_tbi = germline_resource_tbi - bwamem2 = bwamem2 + fasta = fasta + fasta_fai = fasta_fai + dict = dict + bwa = bwa + germline_resource = germline_resource + germline_resource_tbi = 
germline_resource_tbi + bwamem2 = bwamem2 dragmap = dragmap star_index = PREPARE_GENOME.out.star_index gtf = PREPARE_GENOME.out.gtf diff --git a/subworkflows/nf-core/align_star.nf b/subworkflows/nf-core/align_star.nf deleted file mode 100644 index 979cd3c..0000000 --- a/subworkflows/nf-core/align_star.nf +++ /dev/null @@ -1,55 +0,0 @@ -// -// Alignment with STAR -// - -include { STAR_ALIGN } from '../../modules/nf-core/modules/star/align/main' -include { BAM_SORT_SAMTOOLS } from './bam_sort_samtools' - -workflow ALIGN_STAR { - take: - reads // channel: [ val(meta), [ reads ] ] - index // channel: /path/to/star/index/ - gtf // channel: /path/to/genome.gtf - star_ignore_sjdbgtf // value: ignore gtf - seq_platform // value: sequencing platform - seq_center // value: sequencing centre - fasta - - main: - - ch_versions = Channel.empty() - // Map reads with STAR - STAR_ALIGN ( - reads, - index, - gtf, - star_ignore_sjdbgtf, - seq_platform, - seq_center - ) - ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first()) - // Sort, index BAM file and run samtools stats, flagstat and idxstats - BAM_SORT_SAMTOOLS ( - STAR_ALIGN.out.bam, - fasta - ) - ch_versions = ch_versions.mix(BAM_SORT_SAMTOOLS.out.versions.first()) - - emit: - orig_bam = STAR_ALIGN.out.bam // channel: [ val(meta), bam ] - log_final = STAR_ALIGN.out.log_final // channel: [ val(meta), log_final ] - log_out = STAR_ALIGN.out.log_out // channel: [ val(meta), log_out ] - log_progress = STAR_ALIGN.out.log_progress // channel: [ val(meta), log_progress ] - bam_sorted = STAR_ALIGN.out.bam_sorted // channel: [ val(meta), bam_sorted ] - bam_transcript = STAR_ALIGN.out.bam_transcript // channel: [ val(meta), bam_transcript ] - fastq = STAR_ALIGN.out.fastq // channel: [ val(meta), fastq ] - tab = STAR_ALIGN.out.tab // channel: [ val(meta), tab ] - bam = BAM_SORT_SAMTOOLS.out.bam // channel: [ val(meta), [ bam ] ] - bai = BAM_SORT_SAMTOOLS.out.bai // channel: [ val(meta), [ bai ] ] - csi = BAM_SORT_SAMTOOLS.out.csi 
// channel: [ val(meta), [ csi ] ] - stats = BAM_SORT_SAMTOOLS.out.stats // channel: [ val(meta), [ stats ] ] - flagstat = BAM_SORT_SAMTOOLS.out.flagstat // channel: [ val(meta), [ flagstat ] ] - idxstats = BAM_SORT_SAMTOOLS.out.idxstats // channel: [ val(meta), [ idxstats ] ] - versions = ch_versions // channel: [ versions.yml ] - -} diff --git a/subworkflows/nf-core/bam_sort_samtools.nf b/subworkflows/nf-core/bam_sort_samtools.nf index 8d8a347..c071268 100644 --- a/subworkflows/nf-core/bam_sort_samtools.nf +++ b/subworkflows/nf-core/bam_sort_samtools.nf @@ -2,9 +2,9 @@ // Sort, index BAM file and run samtools stats, flagstat and idxstats // -include { SAMTOOLS_SORT } from '../../modules/nf-core/modules/samtools/sort/main' -include { SAMTOOLS_INDEX } from '../../modules/nf-core/modules/samtools/index/main' -include { BAM_STATS_SAMTOOLS } from './bam_stats_samtools' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { BAM_STATS_SAMTOOLS } from './bam_stats_samtools/main' workflow BAM_SORT_SAMTOOLS { take: diff --git a/subworkflows/nf-core/merge_index_bam.nf b/subworkflows/nf-core/merge_index_bam.nf deleted file mode 100644 index b260507..0000000 --- a/subworkflows/nf-core/merge_index_bam.nf +++ /dev/null @@ -1,45 +0,0 @@ -// -// MERGE INDEX BAM -// -// For all modules here: -// A when clause condition is defined in the conf/modules.config to determine if the module should be run - -include { SAMTOOLS_INDEX as INDEX_MERGE_BAM } from '../../modules/nf-core/modules/samtools/index/main' -include { SAMTOOLS_MERGE as MERGE_BAM } from '../../modules/nf-core/modules/samtools/merge/main' - -workflow BAM_MERGE_INDEX_SAMTOOLS { - take: - bam // channel: [mandatory] meta, bam - - main: - versions = Channel.empty() - - // Figuring out if there is one or more bam(s) from the same sample - bam_to_merge = bam.branch{ meta, bam -> - // bam is a list, so use bam.size() to 
asses number of intervals - single: bam.size() <= 1 - return [ meta, bam[0] ] - multiple: bam.size() > 1 - } - - // Only when using intervals - MERGE_BAM(bam_to_merge.multiple, [ [ id:'null' ], []], [ [ id:'null' ], []]) - - // Mix intervals and no_intervals channels together - bam_all = MERGE_BAM.out.bam.mix(bam_to_merge.single) - - // Index bam - INDEX_MERGE_BAM(bam_all) - - // Join with the bai file - bam_bai = bam_all.join(INDEX_MERGE_BAM.out.bai, failOnDuplicate: true, failOnMismatch: true) - - // Gather versions of all tools used - versions = versions.mix(INDEX_MERGE_BAM.out.versions) - versions = versions.mix(MERGE_BAM.out.versions) - - emit: - bam_bai - - versions -} \ No newline at end of file