🔀 Merge pull request #93 from cnr-ibba/issue-92

✨ support normalization only subworkflow
cnr-ibba · Feb 10, 2025 · 7291160 · 7291160
2 parents 02e6160 + fed5b7c
commit 7291160
Show file tree

Hide file tree

Showing 10 changed files with 193 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## 0.6.2 - dev
 
-- Update freebayes to version `1.3.8`
+- Update `nextflow_schema.json` to check for _file and directory_ existence
+- Perform only the normalization workflow using `--normalization_only` parameter ([#92](https://github.com/cnr-ibba/nf-resequencing-mem/issues/92))
+- Update freebayes to version `1.3.8` ([#88](https://github.com/cnr-ibba/nf-resequencing-mem/issues/88))
 - Solve linter issues related to VScode and _nextflow languageserver_ plugin ([#86](https://github.com/cnr-ibba/nf-resequencing-mem/issues/86))
 - Update `.editorconfig`
 - Update modules
@@ -16,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Support for institutional configuration
 - Parallelize normalization steps by chromosomes. Merge VCF files after normalization
 - Normalize VCF file using `vcfwave` ([#76](https://github.com/cnr-ibba/nf-resequencing-mem/issues/76))
-- Add `freebayes_normalized` local subworkflow
+- Add `normalize_vcf` local subworkflow
 - Update `nextflow` to version `24.04.0`
 - Using the `resourceLimits` directive to set the max requirements for each process
 - Update CI system ([#81](https://github.com/cnr-ibba/nf-resequencing-mem/issues/81))
@@ -31,11 +33,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `bcftools/concat` process from _nf-core_ repository
 - Add `bcftools_filltags` process
 - Add `vcflib_vcfwave` local process
-- Add `freebayes_normalized` local subworkflow
+- Add `normalize_vcf` local subworkflow
 - Add `bcftools/sort` process
 
 ### `Fixed`
 
+- Enforce parameters validation through `nextflow_schema.json`
 - Combine _freebayes_ results if `--save-freebayes` parameter is set
 - Rename `bcftools/concat` steps in a more informative way
 - Use remote files with `test` profile

diff --git a/README.md b/README.md
@@ -120,7 +120,11 @@ used to save _intermediate results_ or to skip a particular step:
 - `--gvcf_dont_use_chunk`: (bool, def. false) When writing the gVCF output emit a
   record for all bases, will also route an int to `--gvcf_chunk` similar to
   `--output-mode EMIT_ALL_SITES` from _GATK_
-  `--skip_normalization`: (bool, def. false) skip VCF normalization steps
+- `--skip_normalization`: (bool, def. false) skip VCF normalization steps
+- `--normalization_only`: (bool, def. false) only normalize a VCF file (skip all the
+  other steps, see [Normalize a vcf file](#normalize-a-vcf-file))
+- `--input_vcf`: path to a VCF file to be normalized (required when `--normalization_only` is set)
+- `--input_tbi`: path to a VCF index file (required when `--normalization_only` is set)
 - `--snpeff_database`: annotate the VCF file with SnpEff by providing a pre-built
   database that can be found using the `java -jar snpEff.jar databases` command.
   If the database is known to SnpEff will be downloaded and managed by the pipeline
@@ -365,6 +369,23 @@ Please see the [Amazon Cloud](https://www.nextflow.io/docs/latest/awscloud.html#
 section of nextflow documentation to get other information on nextflow and AWS
 usage.
 
+## Normalize a vcf file
+
+With this pipeline it is possible to perform the normalization workflow on a VCF
+file, without running the whole pipeline. This is useful when you have a VCF file
+that needs to be normalized, for example after a _freebayes_ run. You can call
+this pipeline providing the `--normalization_only` parameter and the `--input_vcf`
+and `--input_tbi` parameters:
+
+```bash
+nextflow run cnr-ibba/nf-resequencing-mem -resume -profile <your profile> \
+  --normalization_only --input_vcf <input.vcf> --input_tbi <input.tbi> \
+  --genome_fasta <genome.fasta> --outdir <results dir>
+```
+
+Other provided parameters will be ignored: the pipeline will normalize the
+VCF file and store the result in the `normalized-vcf` folder of the `outdir` directory.
+
 ## Known issues
 
 ### Ignore sample sheet check

diff --git a/conf/modules.config b/conf/modules.config
@@ -216,7 +216,10 @@ process {
 
     withName: "BCFTOOLS_FILLTAGS|BCFTOOLS_FILLTAGS_TABIX" {
         publishDir = [
-            enabled: false
+            path: { "${params.outdir}/normalized-vcf" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.normalization_only
         ]
     }
 

diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
@@ -44,16 +44,29 @@ class WorkflowMain {
         // Check AWS batch settings
         NfcoreTemplate.awsBatch(workflow, params)
 
-        // Check input has been provided
-        if (!params.input) {
-            Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'")
+        // Check at least one input has been provided
+        if (!params.normalization_only) {
+            // check for mandatory input
+            if (!params.input) {
+                Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'")
+            }
+
+            // check for gvcf_chunk options and gvcf
+            if ((params.gvcf_chunk || params.gvcf_dont_use_chunk) && !params.gvcf) {
+                Nextflow.error("Please provide '--gvcf' option when providing '--gvcf_chunk' or '--gvcf_dont_use_chunk' parameters")
+            } else if (params.gvcf_chunk && params.gvcf_dont_use_chunk) {
+                Nextflow.error("Please provide only one of '--gvcf_chunk' or '--gvcf_dont_use_chunk' parameters")
+            }
         }
 
-        // check for gvcf_chunk options and gvcf
-        if ((params.gvcf_chunk || params.gvcf_dont_use_chunk) && !params.gvcf) {
-            Nextflow.error("Please provide '--gvcf' option when providing '--gvcf_chunk' or '--gvcf_dont_use_chunk' parameters")
-        } else if (params.gvcf_chunk && params.gvcf_dont_use_chunk) {
-            Nextflow.error("Please provide only one of '--gvcf_chunk' or '--gvcf_dont_use_chunk' parameters")
+        // doing the normalization workflow
+        if (params.normalization_only) {
+            if (!params.input_vcf || !params.input_tbi) {
+                Nextflow.error("Please provide a VCF file and its index to the pipeline e.g. '--input_vcf input.vcf --input_tbi input.vcf.tbi' when using '--normalization_only'")
+            }
+            if (params.input) {
+                log.warn("You choose to run the normalization workflow. The input samplesheet will be ignored.")
+            }
         }
     }
 

diff --git a/main.nf b/main.nf
@@ -16,15 +16,18 @@ nextflow.enable.dsl = 2
 */
 
 include { validateParameters; paramsHelp  } from 'plugin/nf-validation'
-include { PIPELINE_INITIALIZATION         } from './subworkflows/local/pipeline_initialization.nf'
+include { PIPELINE_INITIALIZATION         } from './subworkflows/local/pipeline_initialization'
+include { NORMALIZATION_INITIALIZATION    } from './subworkflows/local/pipeline_initialization'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     NAMED WORKFLOW FOR PIPELINE
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-include { RESEQUENCING_MEM } from './workflows/resequencing-mem'
+include { RESEQUENCING_MEM              } from './workflows/resequencing-mem'
+include { NORMALIZE_VCF                 } from './subworkflows/local/normalize_vcf'
+include { CUSTOM_DUMPSOFTWAREVERSIONS   } from './modules/nf-core/custom/dumpsoftwareversions/main'
 
 //
 // WORKFLOW: Run main cnr-ibba/nf-resequencing-mem analysis pipeline
@@ -42,9 +45,33 @@ workflow CNR_IBBA {
     multiqc_report = RESEQUENCING_MEM.out.multiqc_report // channel: /path/to/multiqc_report.html
 }
 
+workflow VCF_NORMALIZE {
+    take:
+    vcf_ch // channel: vcf file
+    tbi_ch // channel: tbi file
+    fasta_ch // channel: fasta file
+
+    main:
+    // collect software version
+    ch_versions = Channel.empty()
+
+    // calling the normalization workflow
+    NORMALIZE_VCF(
+        vcf_ch,
+        tbi_ch,
+        fasta_ch
+    )
+    ch_versions = ch_versions.mix(NORMALIZE_VCF.out.versions)
+
+    // return software version
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    RUN ALL WORKFLOWS
+    RUN CNR_IBBA:RESEQUENCING_MEM WORKFLOWS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
@@ -69,21 +96,36 @@ workflow {
         validateParameters()
     }
 
+    // Initialize the workflow and check specific parameters
     WorkflowMain.initialise(workflow, params, log)
 
-    //
-    // SUBWORKFLOW: Run initializations tasks
-    //
-    PIPELINE_INITIALIZATION (
-        params.input,
-        params.multiqc_config,
-        params.genome_fasta,
-        params.genome_bwa_index
-    )
-
-    CNR_IBBA (
-        PIPELINE_INITIALIZATION.out.samplesheet
-    )
+    if (!params.normalization_only) {
+        // doing the main analysis
+        // Run initializations tasks
+        PIPELINE_INITIALIZATION (
+            params.input
+        )
+
+        // then run the main pipeline
+        CNR_IBBA (
+            PIPELINE_INITIALIZATION.out.samplesheet
+        )
+    } else {
+        // doing only the normalization workflow
+        // setting up
+        NORMALIZATION_INITIALIZATION(
+            params.input_vcf,
+            params.input_tbi,
+            params.genome_fasta
+        )
+
+        // run only the normalization workflow
+        VCF_NORMALIZE (
+            NORMALIZATION_INITIALIZATION.out.vcf_ch,
+            NORMALIZATION_INITIALIZATION.out.tbi_ch,
+            NORMALIZATION_INITIALIZATION.out.fasta_ch
+        )
+    }
 }
 
 /*

diff --git a/nextflow.config b/nextflow.config
@@ -46,6 +46,11 @@ params {
     save_freebayes             = false
     save_unique_fastq          = false
 
+    // Normalization workflow options
+    normalization_only         = false
+    input_vcf                  = null
+    input_tbi                  = null
+
     // Boilerplate options
     outdir                     = './results'
     publish_dir_mode           = 'copy'

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -10,7 +10,7 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "genome_fasta"],
+            "required": ["genome_fasta"],
             "properties": {
                 "input": {
                     "type": "string",
@@ -25,6 +25,7 @@
                 "genome_fasta": {
                     "type": "string",
                     "format": "file-path",
+                    "exists": true,
                     "mimetype": "text/plain",
                     "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
                     "description": "Path to FASTA genome file (compression is supported).",
@@ -33,10 +34,14 @@
                 },
                 "genome_fasta_fai": {
                     "type": "string",
+                    "format": "file-path",
+                    "exists": true,
                     "description": "Path to genome fasta index (skip index calculation)"
                 },
                 "genome_bwa_index": {
                     "type": "string",
+                    "format": "directory-path",
+                    "exists": true,
                     "description": "Path to genome fasta BWA index (skip index calculation)"
                 },
                 "outdir": {
@@ -77,12 +82,16 @@
                 "snpeff_cachedir": {
                     "type": "string",
                     "description": "SnpEff custom cache directory",
+                    "format": "directory-path",
+                    "exists": true,
                     "fa_icon": "fas fa-database",
                     "help_text": "SnpEff cache directory which should include another directory with the name of the database in which a valid SnpEff custom database is stored"
                 },
                 "snpeff_config": {
                     "type": "string",
                     "description": "SnpEff custom configuration file",
+                    "format": "file-path",
+                    "exists": true,
                     "fa_icon": "fas fa-database",
                     "help_text": "SnpEff configuration file which should include the custom database name",
                     "default": "assets/NO_FILE"
@@ -133,7 +142,34 @@
                 },
                 "save_unique_fastq": {
                     "type": "boolean",
-                    "description": "Save purget FASTQ in ${results_dir}"
+                    "description": "Save purged FASTQ in ${results_dir}"
+                }
+            }
+        },
+        "normalization_workflow": {
+            "title": "Normalization workflow",
+            "type": "object",
+            "description": "Normalization workflow parameters",
+            "default": "",
+            "properties": {
+                "normalization_only": {
+                    "type": "boolean",
+                    "description": "Do only the VCF normalization workflow",
+                    "default": false
+                },
+                "input_vcf": {
+                    "type": "string",
+                    "format": "file-path",
+                    "exists": true,
+                    "description": "Path to VCF file to normalize",
+                    "fa_icon": "fas fa-file-code"
+                },
+                "input_tbi": {
+                    "type": "string",
+                    "format": "file-path",
+                    "exists": true,
+                    "description": "Path to VCF index file",
+                    "fa_icon": "fas fa-file-code"
                 }
             }
         },
@@ -313,6 +349,9 @@
         {
             "$ref": "#/definitions/pipeline_custom_parameters"
         },
+        {
+            "$ref": "#/definitions/normalization_workflow"
+        },
         {
             "$ref": "#/definitions/institutional_config_options"
         },

diff --git a/subworkflows/local/freebayes_normalize.nf → subworkflows/local/normalize_vcf.nf b/subworkflows/local/freebayes_normalize.nf → subworkflows/local/normalize_vcf.nf
@@ -12,7 +12,7 @@ include { BCFTOOLS_NORM                     } from '../../modules/nf-core/bcftoo
 include { BCFTOOLS_FILLTAGS                 } from '../../modules/local/bcftools_filltags'
 
 
-workflow FREEBAYES_NORMALIZE {
+workflow NORMALIZE_VCF {
     take:
         vcf_ch    // channel: [mandatory] the VCF file to normalize
         tbi_ch    // channel: [mandatory] the index file for the VCF file