diff --git a/.gitignore b/.gitignore index 8c51880..4d326e9 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ test.csv test2.csv .nf-test .nf-test.log -nf-test-report.tap +nf-test-report* diff --git a/.zenodo.json b/.zenodo.json index d48d7e5..ce10389 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -1,6 +1,6 @@ { "upload_type": "software", - "description": "v0.6.1 Release. See https://github.com/TheJacksonLaboratory/cs-nf-pipelines/wiki", + "description": "See https://github.com/TheJacksonLaboratory/cs-nf-pipelines/wiki", "title": "cs-nf-pipelines", "creators": [ { @@ -36,6 +36,10 @@ "affiliation": "The Jackson Laboratory", "name": "Gabriel Rech" }, + { + "affiliation": "The Jackson Laboratory", + "name": "Ardian Ferraj" + }, { "affiliation": "The Jackson Laboratory", "name": "Anuj Srivastava" diff --git a/README.md b/README.md index 7946bd1..57b0b3e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.11068737.svg)](https://doi.org/10.5281/zenodo.11068737) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.11068736.svg)](https://doi.org/10.5281/zenodo.11068736) # JAX NGS Operations Nextflow DSL2 Pipelines diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 5484b84..bf97c6f 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -1,5 +1,30 @@ # RELEASE NOTES +## Release 0.6.3 + +In this release we change the read disambiguation tool Xenome for Xengsort. Extensive benchmarking shows high concordance among results obtained from both tools. + +Additionally, we correct an issue with the human PTA workflow when running the combination of the `--pdx` and `--split_fastq` options. Data run with this combination of options from version 0.6.0-0.6.2 should be re-run. + +### Pipelines Added: + +None + +### Modules Added: + +1. xengsort/xengsort_classify.nf +1. xengsort/xengsort_index.nf + +### Pipeline Changes: + +1. 
Xengsort replaces Xenome for all PDX based workflows (RNAseq, RNA fusion, Hs PTA, Somatic WES, Somatic WES PTA) +1. Correction made for the Human PTA when running the combination of the `--pdx` and `--split_fastq` options. + +### Module Changes: + +None + + ## Release 0.6.2 In this minor release we adjust memory and wall clock statements, and modified `bin/pta/merge-caller-vcfs.r` to correct for an edge case related bug. diff --git a/bin/help/pta.nf b/bin/help/pta.nf index 84249b5..44a8276 100644 --- a/bin/help/pta.nf +++ b/bin/help/pta.nf @@ -12,8 +12,10 @@ The following are human specific parameters. To see help for mouse, add `--gen_o --csv_input | / | CSV delimited sample sheet that controls how samples are processed. The required input header is: patient,sex,status,sampleID,lane,fastq_1,fastq_2. See the repository wiki (https://github.com/TheJacksonLaboratory/cs-nf-pipelines/wiki) for additional information. ---xenome_prefix | /projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25| Xenome index for deconvolution of human and mouse reads. Used when `--pdx` is run. ---pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. +--pdx | false | Options: false, true. If specified, 'Xengsort' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. +--xengsort_host_fasta | '/projects/compsci/omics_share/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' | Xengsort host fasta file. Used by Xengsort Index when `--pdx` is run, and xengsort_idx_path is `null` or false. +--xengsort_idx_path | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xengsort' | Xengsort index for deconvolution of human and mouse reads. Used when `--pdx` is run. If `null`, Xengsort Index is run using ref_fa and host_fa. 
+--xengsort_idx_name | 'hg38_GRCm39-NOD_ShiLtJ' | Xengsort index name associated with files located in `xengsort_idx_path` or name given to outputs produced by Xengsort Index --deduplicate_reads | false | Options: false, true. If specified, run bbmap clumpify on input reads. Clumpify will deduplicate reads prior to trimming. This can help with mapping and downstream steps when analyzing high coverage WGS data. diff --git a/bin/help/rna_fusion.nf b/bin/help/rna_fusion.nf index 073939a..00bbb40 100644 --- a/bin/help/rna_fusion.nf +++ b/bin/help/rna_fusion.nf @@ -17,7 +17,6 @@ Parameter | Default | Description --gen_org | mouse | Options: mouse and human. ---xenome_prefix | /projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25| Xenome index for deconvolution of human and mouse reads. Used when `--pdx` is run. --read_length | 150 | Options: 75, 100, 150. Changed relative to sample read length. --star_index | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/star/star-2.7.4a-150bp | STAR index used by several tools. Change the index relative to sample read length. Read length options: 75, 100, 150. --star_fusion_star_index | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/starfusion/star-150 | STAR-fusion index. Change the index relative to sample read length. Read length options: 75, 100, 150. @@ -47,7 +46,11 @@ Parameter | Default | Description --fusion_report_opt | null | Additional fusion-report options can be provided. --databases | /projects/compsci/omics_share/human/GRCh38/supporting_files/rna_fusion_dbs | Fusion-report databases of known fusion events. Used in report generation only. ---pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. +--pdx | false | Options: false, true. If specified, 'Xengsort' is run on reads to deconvolute human and mouse reads. 
Human only reads are used in analysis. +--ref_fa | '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta'| Xengsort graft fasta file. Used by Xengsort Index when `--pdx` is run, and xengsort_idx_path is `null` or false. +--xengsort_host_fasta | '/projects/compsci/omics_share/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' | Xengsort host fasta file. Used by Xengsort Index when `--pdx` is run, and xengsort_idx_path is `null` or false. +--xengsort_idx_path | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xengsort' | Xengsort index for deconvolution of human and mouse reads. Used when `--pdx` is run. If `null`, Xengsort Index is run using ref_fa and host_fa. +--xengsort_idx_name | 'hg38_GRCm39-NOD_ShiLtJ' | Xengsort index name associated with files located in `xengsort_idx_path` or name given to outputs produced by Xengsort Index ''' } diff --git a/bin/help/rnaseq.nf b/bin/help/rnaseq.nf index 3cae8ea..5229183 100644 --- a/bin/help/rnaseq.nf +++ b/bin/help/rnaseq.nf @@ -18,9 +18,6 @@ Parameter | Default | Description --gen_org | mouse | Options: mouse and human. --genome_build | 'GRCm38' | Mouse specific. Options: GRCm38 or GRCm39. If gen_org == human, build defaults to GRCm38. ---pdx | false | Options: true or false. If 'true' Xenome is run to remove mouse reads from samples. ---xenome_prefix | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25' | Pre-compiled Xenome classification index files. Used if PDX analysis is specified. - --quality_phred | 15 | The quality value that is required for a base to pass. Default: 15 which is a phred quality score of >=Q15. --unqualified_perc | 40 | Percent of bases that are allowed to be unqualified (0~100). Default: 40 which is 40%. --detect_adapter_for_pe | false | If true, adapter auto-detection is used for paired end data. 
By default, paired-end data adapter sequence auto-detection is disabled as the adapters can be trimmed by overlap analysis. However, --detect_adapter_for_pe will enable it. Fastp will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. @@ -50,8 +47,13 @@ Parameter | Default | Description | Human: '/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr_patch_hapl_scaff.rRNA.interval_list' | The coverage metric calculation step requires this file. Refers to human assembly when --gen_org human. JAX users should not change this parameter. ---pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. +--pdx | false | Options: false, true. If specified, 'Xengsort' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. --classifier_table | '/projects/compsci/omics_share/human/GRCh38/supporting_files/rna_ebv_classifier/EBVlym_classifier_table_48.txt' | EBV expected gene signatures used in EBV classifier. Only used when '--pdx' is run. +--ref_fa | '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta'| Xengsort graft fasta file. Used by Xengsort Index when `--pdx` is run, and xengsort_idx_path is `null` or false. +--xengsort_host_fasta | '/projects/compsci/omics_share/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' | Xengsort host fasta file. Used by Xengsort Index when `--pdx` is run, and xengsort_idx_path is `null` or false. +--xengsort_idx_path | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xengsort' | Xengsort index for deconvolution of human and mouse reads. Used when `--pdx` is run. If `null`, Xengsort Index is run using ref_fa and host_fa. 
+--xengsort_idx_name | 'hg38_GRCm39-NOD_ShiLtJ' | Xengsort index name associated with files located in `xengsort_idx_path` or name given to outputs produced by Xengsort Index + There are two additional parameters that are human specific. They are: diff --git a/bin/help/somatic_wes.nf b/bin/help/somatic_wes.nf index fd5704b..916b6c8 100644 --- a/bin/help/somatic_wes.nf +++ b/bin/help/somatic_wes.nf @@ -21,8 +21,10 @@ Parameter | Default | Description --unqualified_perc | 40 | Percent of bases that are allowed to be unqualified (0~100). Default: 40 which is 40%. --detect_adapter_for_pe | false | If true, adapter auto-detection is used for paired end data. By default, paired-end data adapter sequence auto-detection is disabled as the adapters can be trimmed by overlap analysis. However, --detect_adapter_for_pe will enable it. Fastp will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. ---pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. ---xenome_prefix | /projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25| Xenome index for deconvolution of human and mouse reads. Used when `--pdx` is run. +--pdx | false | Options: false, true. If specified, 'Xengsort' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. +--xengsort_host_fasta | '/projects/compsci/omics_share/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' | Xengsort host fasta file. Used by Xengsort Index when `--pdx` is run, and xengsort_idx_path is `null` or false. +--xengsort_idx_path | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xengsort' | Xengsort index for deconvolution of human and mouse reads. 
Used when `--pdx` is run. If `null`, Xengsort Index is run using ref_fa and xengsort_host_fasta. +--xengsort_idx_name | 'hg38_GRCm39-NOD_ShiLtJ' | Xengsort index name associated with files located in `xengsort_idx_path` or name given to outputs produced by Xengsort Index --genotype_targets | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2_targets_annotations.snpwt.bed.gz' | Target SNP bed file for the ancestry panel. Can contain annotation information. --snpID_list | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2.list' | Target SNPs in list used in BCFtools filtering step diff --git a/bin/help/somatic_wes_pta.nf b/bin/help/somatic_wes_pta.nf index 820c4af..f272e15 100644 --- a/bin/help/somatic_wes_pta.nf +++ b/bin/help/somatic_wes_pta.nf @@ -21,8 +21,10 @@ Parameter | Default | Description --unqualified_perc | 40 | Percent of bases that are allowed to be unqualified (0~100). Default: 40 which is 40%. --detect_adapter_for_pe | false | If true, adapter auto-detection is used for paired end data. By default, paired-end data adapter sequence auto-detection is disabled as the adapters can be trimmed by overlap analysis. However, --detect_adapter_for_pe will enable it. Fastp will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. ---pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. ---xenome_prefix | /projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25| Xenome index for deconvolution of human and mouse reads. Used when `--pdx` is run. +--pdx | false | Options: false, true. If specified, 'Xengsort' is run on reads to deconvolute human and mouse reads. 
Human only reads are used in analysis. +--xengsort_host_fasta | '/projects/compsci/omics_share/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' | Xengsort host fasta file. Used by Xengsort Index when `--pdx` is run, and xengsort_idx_path is `null` or false. +--xengsort_idx_path | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xengsort' | Xengsort index for deconvolution of human and mouse reads. Used when `--pdx` is run. If `null`, Xengsort Index is run using ref_fa and xengsort_host_fasta. +--xengsort_idx_name | 'hg38_GRCm39-NOD_ShiLtJ' | Xengsort index name associated with files located in `xengsort_idx_path` or name given to outputs produced by Xengsort Index --genotype_targets | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2_targets_annotations.snpwt.bed.gz' | Target SNP bed file for the ancestry panel. Can contain annotation information. --snpID_list | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2.list' | Target SNPs in list used in BCFtools filtering step diff --git a/bin/log/pta.nf b/bin/log/pta.nf index 2e98e8f..c3a0219 100644 --- a/bin/log/pta.nf +++ b/bin/log/pta.nf @@ -34,7 +34,9 @@ ______________________________________________________ --quality_phred ${params.quality_phred} --unqualified_perc ${params.unqualified_perc} --detect_adapter_for_pe ${params.detect_adapter_for_pe} ---xenome_prefix ${params.xenome_prefix} +--xengsort_host_fasta ${params.xengsort_host_fasta} +--xengsort_idx_path ${params.xengsort_idx_path} +--xengsort_idx_name ${params.xengsort_idx_name} --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --ref_fa_dict ${params.ref_fa_dict} diff --git a/bin/log/rna_fusion.nf b/bin/log/rna_fusion.nf index fbe1cfb..13695ca 100644 --- a/bin/log/rna_fusion.nf +++ b/bin/log/rna_fusion.nf @@ -27,7 +27,10 @@ ______________________________________________________ --keep_intermediate ${params.keep_intermediate} -c ${params.config} 
--multiqc_config ${params.multiqc_config} ---xenome_prefix ${params.xenome_prefix} +--ref_fa ${params.ref_fa} +--xengsort_host_fasta ${params.xengsort_host_fasta} +--xengsort_idx_path ${params.xengsort_idx_path} +--xengsort_idx_name ${params.xengsort_idx_name} --read_length ${params.read_length} --star_index ${params.star_index} --star_fusion_star_index ${params.star_fusion_star_index} diff --git a/bin/log/rnaseq.nf b/bin/log/rnaseq.nf index 3a851c7..c788944 100644 --- a/bin/log/rnaseq.nf +++ b/bin/log/rnaseq.nf @@ -47,7 +47,10 @@ ______________________________________________________ --detect_adapter_for_pe ${params.detect_adapter_for_pe} --pdx ${params.pdx} ---xenome_prefix ${params.xenome_prefix} +--ref_fa ${params.ref_fa} +--xengsort_host_fasta ${params.xengsort_host_fasta} +--xengsort_idx_path ${params.xengsort_idx_path} +--xengsort_idx_name ${params.xengsort_idx_name} --strandedness_ref ${params.strandedness_ref} --strandedness_gtf ${params.strandedness_gtf} diff --git a/bin/log/somatic_wes.nf b/bin/log/somatic_wes.nf index 02cf225..51e5484 100644 --- a/bin/log/somatic_wes.nf +++ b/bin/log/somatic_wes.nf @@ -27,7 +27,9 @@ ______________________________________________________ --pubdir ${params.pubdir} --organize_by ${params.organize_by} --pdx ${params.pdx} ---xenome_index ${params.xenome_prefix} +--xengsort_host_fasta ${params.xengsort_host_fasta} +--xengsort_idx_path ${params.xengsort_idx_path} +--xengsort_idx_name ${params.xengsort_idx_name} --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --quality_phred ${params.quality_phred} diff --git a/bin/log/somatic_wes_pta.nf b/bin/log/somatic_wes_pta.nf index 573e85d..dd3ea6b 100644 --- a/bin/log/somatic_wes_pta.nf +++ b/bin/log/somatic_wes_pta.nf @@ -22,7 +22,9 @@ ______________________________________________________ --pubdir ${params.pubdir} --organize_by ${params.organize_by} --pdx ${params.pdx} ---xenome_index ${params.xenome_prefix} +--xengsort_host_fasta ${params.xengsort_host_fasta} 
+--xengsort_idx_path ${params.xengsort_idx_path} +--xengsort_idx_name ${params.xengsort_idx_name} --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --quality_phred ${params.quality_phred} diff --git a/bin/shared/multiqc/pta_multiqc.yaml b/bin/shared/multiqc/pta_multiqc.yaml index e07d3d8..216616a 100644 --- a/bin/shared/multiqc/pta_multiqc.yaml +++ b/bin/shared/multiqc/pta_multiqc.yaml @@ -8,7 +8,7 @@ export_plots: true module_order: - fastp - fastqc - - xenome + - xengsort - conpair - gatk - picard diff --git a/bin/shared/multiqc/rna_fusion_multiqc.yaml b/bin/shared/multiqc/rna_fusion_multiqc.yaml index 3ba15cc..5d8ff97 100644 --- a/bin/shared/multiqc/rna_fusion_multiqc.yaml +++ b/bin/shared/multiqc/rna_fusion_multiqc.yaml @@ -7,7 +7,7 @@ export_plots: true module_order: - fastqc - - xenome + - xengsort - custom_content table_columns_visible: diff --git a/bin/shared/multiqc/rnaseq_multiqc.yaml b/bin/shared/multiqc/rnaseq_multiqc.yaml index 9022b85..8eb7a03 100644 --- a/bin/shared/multiqc/rnaseq_multiqc.yaml +++ b/bin/shared/multiqc/rnaseq_multiqc.yaml @@ -8,7 +8,7 @@ export_plots: true module_order: - fastp - fastqc - - xenome + - xengsort - star - rsem - picard diff --git a/bin/shared/multiqc/somatic_wes_multiqc.yaml b/bin/shared/multiqc/somatic_wes_multiqc.yaml index fcf2d7c..f3553ec 100644 --- a/bin/shared/multiqc/somatic_wes_multiqc.yaml +++ b/bin/shared/multiqc/somatic_wes_multiqc.yaml @@ -8,7 +8,7 @@ export_plots: true module_order: - fastp - fastqc - - xenome + - xengsort - gatk - picard diff --git a/bin/shared/multiqc/somatic_wes_pta_multiqc.yaml b/bin/shared/multiqc/somatic_wes_pta_multiqc.yaml index fcf2d7c..f3553ec 100644 --- a/bin/shared/multiqc/somatic_wes_pta_multiqc.yaml +++ b/bin/shared/multiqc/somatic_wes_pta_multiqc.yaml @@ -8,7 +8,7 @@ export_plots: true module_order: - fastp - fastqc - - xenome + - xengsort - gatk - picard diff --git a/config/pta.config b/config/pta.config index 2e8c555..f72fa3d 100644 --- a/config/pta.config 
+++ b/config/pta.config @@ -29,8 +29,10 @@ params { // NOTE: For PE data, the adapter sequence auto-detection is disabled by default since the adapters can be trimmed by overlap analysis. However, you can specify --detect_adapter_for_pe to enable it. // For PE data, fastp will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. - // Xenome index - xenome_prefix=params.reference_cache+'/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + // xengsort index + xengsort_host_fasta = params.reference_cache+'/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' + xengsort_idx_path = params.reference_cache+'/human/GRCh38/supporting_files/xengsort' + xengsort_idx_name = 'hg38_GRCm39-NOD_ShiLtJ' // Reference fasta ref_fa = params.reference_cache+'/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' diff --git a/config/rna_fusion.config b/config/rna_fusion.config index a73aee7..b4a6069 100644 --- a/config/rna_fusion.config +++ b/config/rna_fusion.config @@ -2,7 +2,7 @@ manifest { name = "rna_fusion" - description = 'Pipeline for processing of PDX RNASeq samples to call RNA Fusions, contains xenome step for processing PDX samples' + description = 'Pipeline for processing of PDX RNASeq samples to call RNA Fusions, contains xengsort step for processing PDX samples' } params { @@ -21,8 +21,11 @@ params { // PDX pdx = false - // Xenome index - xenome_prefix=params.reference_cache+'/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + // xengsort index + ref_fa = params.reference_cache+'/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + xengsort_host_fasta = params.reference_cache+'/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' + xengsort_idx_path = params.reference_cache+'/human/GRCh38/supporting_files/xengsort' 
+ xengsort_idx_name = 'hg38_GRCm39-NOD_ShiLtJ' // READ LENGTH ADJUSTMENTS: read_length = 150 // change relative to sample being processed. 75, 100, 125, and 150 are supported. diff --git a/config/rnaseq.config b/config/rnaseq.config index d2b23aa..468014d 100644 --- a/config/rnaseq.config +++ b/config/rnaseq.config @@ -21,7 +21,7 @@ params { csv_input = null - pdx = false // if PDX, gen_org == human and xenome is run to remove mouse reads from the sample(s). + pdx = false // if PDX, gen_org == human and xengsort is run to remove mouse reads from the sample(s). multiqc_config = "${projectDir}/bin/shared/multiqc/rnaseq_multiqc.yaml" @@ -105,8 +105,11 @@ if (params.gen_org=='human' && params.pdx){ params.strandedness_ref = params.reference_cache+'/human/GRCh38/transcriptome/indices/ensembl/v104/kallisto/kallisto_index' params.strandedness_gtf = params.reference_cache+'/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.gtf' - // Xenome - params.xenome_prefix=params.reference_cache+'/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + // include for Xengsort index + params.ref_fa = params.reference_cache+'/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + params.xengsort_host_fasta = params.reference_cache+'/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' + params.xengsort_idx_path = params.reference_cache+'/human/GRCh38/supporting_files/xengsort' + params.xengsort_idx_name = 'hg38_GRCm39-NOD_ShiLtJ' // General RSEM params.seed_length = '25' diff --git a/config/somatic_wes.config b/config/somatic_wes.config index 5feffbe..fa401ae 100644 --- a/config/somatic_wes.config +++ b/config/somatic_wes.config @@ -40,8 +40,10 @@ params { // Genome Window File hg38_windows = params.reference_cache+'/human/GRCh38/genome/annotation/intervals/hg38_chrom_sizes.window.1000000.bed' - // Xenome index - 
xenome_prefix=params.reference_cache+'/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + // Xengsort index + xengsort_host_fasta = params.reference_cache+'/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' + xengsort_idx_path = params.reference_cache+'/human/GRCh38/supporting_files/xengsort' + xengsort_idx_name = 'hg38_GRCm39-NOD_ShiLtJ' // WES capture array BED and GATK intervals lists target_gatk = params.reference_cache+'/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed' diff --git a/config/somatic_wes_pta.config b/config/somatic_wes_pta.config index e20d96f..ebf354b 100644 --- a/config/somatic_wes_pta.config +++ b/config/somatic_wes_pta.config @@ -37,8 +37,10 @@ params { // FFPE ffpe = false - // Xenome index - xenome_prefix=params.reference_cache+'/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + // include for Xengsort index + xengsort_host_fasta = params.reference_cache+'/mouse/GRCm39/genome/sequence/imputed/rel_2112_v8/NOD_ShiLtJ.39.fa' + xengsort_idx_path = params.reference_cache+'/human/GRCh38/supporting_files/xengsort' + xengsort_idx_name = 'hg38_GRCm39-NOD_ShiLtJ' // WES capture array BED and GATK intervals lists target_gatk = params.reference_cache+'/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed' diff --git a/modules/gbrs/gbrs_bam2emase.nf b/modules/gbrs/gbrs_bam2emase.nf index 2542e45..11c6285 100644 --- a/modules/gbrs/gbrs_bam2emase.nf +++ b/modules/gbrs/gbrs_bam2emase.nf @@ -2,7 +2,7 @@ process GBRS_BAM2EMASE { tag "$sampleID" cpus 1 - memory { params.read_type == 'SE' ? 12.GB : 200.GB } + memory 200.GB time 6.hour errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} diff --git a/modules/gbrs/gbrs_compress.nf b/modules/gbrs/gbrs_compress.nf index 8ee6823..dfd24eb 100644 --- a/modules/gbrs/gbrs_compress.nf +++ b/modules/gbrs/gbrs_compress.nf @@ -2,7 +2,7 @@ process GBRS_COMPRESS { tag "$sampleID" cpus 1 - memory { params.read_type == 'SE' ? 12.GB : 250.GB } + memory { params.read_type == 'SE' ? 50.GB : 250.GB } time 5.hour errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} diff --git a/modules/jaffa/jaffa.nf b/modules/jaffa/jaffa.nf index 6749f14..296601a 100644 --- a/modules/jaffa/jaffa.nf +++ b/modules/jaffa/jaffa.nf @@ -26,7 +26,7 @@ process JAFFA { bpipe run -v \ -n ${task.cpus} \ - -p fastqInputFormat='%_*.${ext}' \ + -p fastqInputFormat='*.${ext}' \ -p refBase=${params.jaffa_ref_dir} \ -p genome=hg38 \ -p annotation=genCode22 \ @@ -38,4 +38,4 @@ process JAFFA { mv jaffa_results.fasta ${sampleID}_jaffa_fusions.fasta ; """ -} \ No newline at end of file +} diff --git a/modules/utility_modules/gzip.nf b/modules/utility_modules/gzip.nf index a7dc16a..b2cebec 100644 --- a/modules/utility_modules/gzip.nf +++ b/modules/utility_modules/gzip.nf @@ -9,7 +9,7 @@ process GZIP { container "quay.io/jaxcompsci/py3_perl_pylibs:v2" - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/processed_reads' : 'xenome' }", pattern: "*.gz", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/processed_reads' : 'xengsort' }", pattern: "*.gz", mode:'copy' input: tuple val(sampleID), path(reads) diff --git a/modules/xengsort/xengsort_classify.nf b/modules/xengsort/xengsort_classify.nf new file mode 100644 index 0000000..7b7950d --- /dev/null +++ b/modules/xengsort/xengsort_classify.nf @@ -0,0 +1,65 @@ +process XENGSORT_CLASSIFY { + + tag "$sampleID" + + // resource utilization + cpus 32 + memory 60.GB + time 48.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + // load xengsort container + container 'quay.io/biocontainers/xengsort:2.0.5--pyhdfd78af_0' + + // output directory + // publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/xengsort/xengsort_classify' : 'xengsort'}", pattern: "*.fq", mode: "copy" + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/stats': 'xengsort' }", pattern: "*.txt", mode:'copy' + + // inputs + input: + path(xengsort_index) + tuple val(sampleID), path(trimmed) + + output: + tuple val(sampleID), path("fastq-graft.*.fq"), emit: xengsort_human_fastq + tuple val(sampleID), path("fastq-host.*.fq"), emit: xengsort_mouse_fastq + tuple val(sampleID), path("*.txt"), emit: xengsort_log + + script: + + // specify single-end or paired-end data + if (params.read_type == "SE") + + """ + + xengsort classify \ + --index ${xengsort_index}/${params.xengsort_idx_name} \ + --fastq ${trimmed[0]} \ + --prefix ${sampleID} \ + --mode count \ + --threads ${task.cpus} \ + --out fastq \ + --chunksize 32.0 \ + --compression none &> ${sampleID}_xengsort_log.txt + + """ + + else if (params.read_type == "PE") + + """ + + xengsort classify \ + --index ${xengsort_index}/${params.xengsort_idx_name} \ + --fastq ${trimmed[0]} \ + --pairs ${trimmed[1]} \ + --prefix ${sampleID} \ + --mode count \ + --threads ${task.cpus} \ + --out fastq \ + --chunksize 32.0 \ + --compression none &> ${sampleID}_xengsort_log.txt + + """ + + else error "${params.read_type} is invalid, specify either SE or PE" +} \ No newline at end of file diff --git a/modules/xengsort/xengsort_index.nf b/modules/xengsort/xengsort_index.nf new file mode 100644 index 0000000..8361a8c --- /dev/null +++ b/modules/xengsort/xengsort_index.nf @@ -0,0 +1,37 @@ +process XENGSORT_INDEX { + + // resource utilization + cpus 32 + memory 60.GB + time 1.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + // load xengsort container + container 'quay.io/biocontainers/xengsort:2.0.5--pyhdfd78af_0' + + // output directory + publishDir "${params.pubdir}/xengsort/", mode: 'copy' + + // inputs + input: + path(xengsort_host_fasta) + path(graft_fasta) + + output: + // index output + path("xengsort_index/"), emit: xengsort_index + + script: + """ + xengsort index \ + --index ${params.xengsort_idx_name} \ + -H ${xengsort_host_fasta} \ + -G ${graft_fasta} \ + -k 25 \ + -n 4_500_000_000 \ + -W ${task.cpus} + + mkdir xengsort_index + mv *.info *.hash xengsort_index/ + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index c479adc..16a7401 100644 --- a/nextflow.config +++ b/nextflow.config @@ -46,7 +46,7 @@ manifest { mainScript = "main.nf" nextflowVersion = "!>=22.04.3" version = "0.6.2" - author = 'Michael Lloyd, Brian Sanderson, Barry Guglielmo, Sai Lek, Peter Fields, Harshpreet Chandok, Carolyn Paisie, Gabriel Rech, Anuj Srivastava. Copyright Jackson Laboratory 2024' + author = 'Michael Lloyd, Brian Sanderson, Barry Guglielmo, Sai Lek, Peter Fields, Harshpreet Chandok, Carolyn Paisie, Gabriel Rech, Ardian Ferraj, Anuj Srivastava. 
Copyright Jackson Laboratory 2024' } profiles { diff --git a/subworkflows/hs_pta.nf b/subworkflows/hs_pta.nf index 6f71642..62e1218 100644 --- a/subworkflows/hs_pta.nf +++ b/subworkflows/hs_pta.nf @@ -6,7 +6,8 @@ include {CLUMPIFY} from "${projectDir}/modules/bbmap/bbmap_clumpify" include {FASTP} from "${projectDir}/modules/fastp/fastp" include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" -include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {XENGSORT_INDEX} from "${projectDir}/modules/xengsort/xengsort_index" +include {XENGSORT_CLASSIFY} from "${projectDir}/modules/xengsort/xengsort_classify" include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" include {SAMTOOLS_MERGE} from "${projectDir}/modules/samtools/samtools_merge" @@ -151,8 +152,8 @@ workflow HS_PTA { // PDX CASES TO ADD AND VALIDATE: // Normal samples should PASS the PDX step. - // ** Step 2a: Xenome if PDX data used. - ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + // ** Step 2a: Xengsort if PDX data used. + ch_XENGSORT_CLASSIFY_multiqc = Channel.empty() //optional log file. 
if (params.pdx){ FASTP.out.trimmed_fastq.join(meta_ch).branch{ @@ -162,11 +163,19 @@ workflow HS_PTA { normal_fastqs = fastq_files.normal.map{it -> [it[0], it[1]] } - // Xenome Classification - XENOME_CLASSIFY(fastq_files.tumor.map{it -> [it[0], it[1]] }) - ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats // set log file for multiqc + // Generate Xengsort Index if needed + if (params.xengsort_idx_path) { + xengsort_index = params.xengsort_idx_path + } else { + XENGSORT_INDEX(params.xengsort_host_fasta, params.ref_fa) + xengsort_index = XENGSORT_INDEX.out.xengsort_index + } + + // Xengsort Classification + XENGSORT_CLASSIFY(xengsort_index, fastq_files.tumor.map{it -> [it[0], it[1]] }) + ch_XENGSORT_CLASSIFY_multiqc = XENGSORT_CLASSIFY.out.xengsort_log - bwa_mem_mapping = XENOME_CLASSIFY.out.xenome_human_fastq.mix(normal_fastqs).join(READ_GROUPS.out.read_groups) + bwa_mem_mapping = XENGSORT_CLASSIFY.out.xengsort_human_fastq.mix(normal_fastqs).join(READ_GROUPS.out.read_groups) .map{it -> [it[0], it[1], 'aln', it[2]]} } else { @@ -178,7 +187,7 @@ workflow HS_PTA { if (params.split_fastq) { if (params.read_type == 'PE') { - split_fastq_files = FASTP.out.trimmed_fastq + split_fastq_files = bwa_mem_mapping .map{it -> [it[0], it[1][0], it[1][1]]} .splitFastq(by: params.split_fastq_bin_size, file: true, pe: true) .map{it -> [it[0], [it[1], it[2]], it[1].name.split('\\.')[-2]]} @@ -187,7 +196,7 @@ workflow HS_PTA { // splitFastq adds an increment between *R* and .fastq. // This can be used to set an 'index' value to make file names unique. 
} else { - split_fastq_files = FASTP.out.trimmed_fastq + split_fastq_files = bwa_mem_mapping .map{it -> [it[0], it[1]]} .splitFastq(by: params.split_fastq_bin_size, file: true) .map{it -> [it[0], it[1], it[1].name.split('\\.')[-2]]} @@ -864,7 +873,7 @@ workflow HS_PTA { ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(FASTP.out.quality_json.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_XENGSORT_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTWGSMETRICS.out.txt.collect{it[1]}.ifEmpty([])) diff --git a/subworkflows/pdx_rnaseq.nf b/subworkflows/pdx_rnaseq.nf index 0a84098..10392a1 100644 --- a/subworkflows/pdx_rnaseq.nf +++ b/subworkflows/pdx_rnaseq.nf @@ -8,7 +8,8 @@ include {READ_GROUPS as READ_GROUPS_HUMAN; include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {GET_READ_LENGTH} from "${projectDir}/modules/utility_modules/get_read_length" include {CHECK_STRANDEDNESS} from "${projectDir}/modules/python/python_check_strandedness" -include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {XENGSORT_INDEX} from "${projectDir}/modules/xengsort/xengsort_index" +include {XENGSORT_CLASSIFY} from "${projectDir}/modules/xengsort/xengsort_classify" // include {GZIP as GZIP_HUMAN; // GZIP as GZIP_MOUSE} from "${projectDir}/modules/utility_modules/gzip" include {RSEM_ALIGNMENT_EXPRESSION as RSEM_ALIGNMENT_EXPRESSION_HUMAN; @@ -30,15 +31,15 @@ workflow PDX_RNASEQ { read_ch main: - // Step 1: Read trim, Get read group information, Run Xenome + // 
Step 1: Read trim, Get read group information, Run xengsort FASTP(read_ch) GET_READ_LENGTH(read_ch) if (params.read_type == 'PE') { - xenome_input = FASTP.out.trimmed_fastq + xengsort_input = FASTP.out.trimmed_fastq } else { - xenome_input = FASTP.out.trimmed_fastq + xengsort_input = FASTP.out.trimmed_fastq } // QC is assess on all reads. Mouse/human is irrelevant here. @@ -46,22 +47,27 @@ workflow PDX_RNASEQ { CHECK_STRANDEDNESS(FASTP.out.trimmed_fastq) - // Xenome Classification - XENOME_CLASSIFY(xenome_input) + // Generate Xengsort Index if needed + if (params.xengsort_idx_path) { + xengsort_index = params.xengsort_idx_path + } else { + XENGSORT_INDEX(params.xengsort_host_fasta, params.ref_fa) + xengsort_index = XENGSORT_INDEX.out.xengsort_index + } - human_reads = XENOME_CLASSIFY.out.xenome_human_fastq + // Xengsort Classification + XENGSORT_CLASSIFY(xengsort_index, xengsort_input) + + human_reads = XENGSORT_CLASSIFY.out.xengsort_human_fastq .join(CHECK_STRANDEDNESS.out.strand_setting) .join(GET_READ_LENGTH.out.read_length) .map{it -> tuple(it[0]+'_human', it[1], it[2], it[3])} - mouse_reads = XENOME_CLASSIFY.out.xenome_mouse_fastq + mouse_reads = XENGSORT_CLASSIFY.out.xengsort_mouse_fastq .join(CHECK_STRANDEDNESS.out.strand_setting) .join(GET_READ_LENGTH.out.read_length) .map{it -> tuple(it[0]+'_mouse', it[1], it[2], it[3])} - // GZIP_HUMAN(XENOME_CLASSIFY.out.xenome_human_fastq) - // GZIP_MOUSE(XENOME_CLASSIFY.out.xenome_mouse_fastq) - // Step 2: RSEM Human and Stats: RSEM_ALIGNMENT_EXPRESSION_HUMAN(human_reads, params.rsem_ref_files_human, params.rsem_star_prefix_human, params.rsem_ref_prefix_human) @@ -107,7 +113,7 @@ workflow PDX_RNASEQ { ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(FASTP.out.quality_json.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(XENOME_CLASSIFY.out.xenome_stats.collect{it[1]}.ifEmpty([])) + 
ch_multiqc_files = ch_multiqc_files.mix(XENGSORT_CLASSIFY.out.xengsort_log.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.rsem_cnt.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.star_log.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTRNASEQMETRICS_HUMAN.out.picard_metrics.collect{it[1]}.ifEmpty([])) diff --git a/tests/workflows/pta.nf.test b/tests/workflows/pta.nf.test index b50bc62..5c7fe23 100644 --- a/tests/workflows/pta.nf.test +++ b/tests/workflows/pta.nf.test @@ -47,7 +47,6 @@ nextflow_workflow { test("Full Workflow -- Human -- Deduplicate and Coverage Cap") { tag "GRCh38" - tag "pdx" tag "options" when { params { diff --git a/tests/workflows/rnaseq.nf.test b/tests/workflows/rnaseq.nf.test index af0f2a3..2f49a48 100644 --- a/tests/workflows/rnaseq.nf.test +++ b/tests/workflows/rnaseq.nf.test @@ -69,7 +69,7 @@ nextflow_workflow { params { outdir = "tests/results" gen_org = "human" - sample_folder = "${baseDir}/test/rna/human" + sample_folder = "${baseDir}/test/rna/human/pdx" pubdir = "tests/results" pdx = true pipeline = 'rnaseq' diff --git a/tests/workflows/somatic_wes_pta.nf.test b/tests/workflows/somatic_wes_pta.nf.test index ad39a9f..b4785e3 100644 --- a/tests/workflows/somatic_wes_pta.nf.test +++ b/tests/workflows/somatic_wes_pta.nf.test @@ -26,6 +26,7 @@ nextflow_workflow { test("Full Workflow -- PDX") { tag "GRCh38" tag "primary" + tag "pdx" when { params { outdir = "tests/results" diff --git a/workflows/rna_fusion.nf b/workflows/rna_fusion.nf index 46db643..5fec7d6 100644 --- a/workflows/rna_fusion.nf +++ b/workflows/rna_fusion.nf @@ -10,7 +10,8 @@ include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from 
"${projectDir}/modules/utility_modules/concatenate_reads_PE" include {GUNZIP} from "${projectDir}/modules/utility_modules/gunzip" -include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {XENGSORT_INDEX} from "${projectDir}/modules/xengsort/xengsort_index" +include {XENGSORT_CLASSIFY} from "${projectDir}/modules/xengsort/xengsort_classify" include {STAR_ALIGN as STAR_ARRIBA; STAR_ALIGN as STAR_SQUID; STAR_ALIGN as STAR_STARFUSION} from "${projectDir}/modules/star/star_align" @@ -105,14 +106,23 @@ workflow RNA_FUSION { FASTQC(GUNZIP.out.gunzip_fastq) - // Step 1a: Xenome if PDX data used. - ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + // Step 1a: Xengsort if PDX data used. + ch_XENGSORT_CLASSIFY_multiqc = Channel.empty() //optional log file. if (params.pdx){ - // Xenome Classification - XENOME_CLASSIFY(GUNZIP.out.gunzip_fastq) - ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats //set log file for multiqc - fusion_tool_input = XENOME_CLASSIFY.out.xenome_human_fastq + // Generate Xengsort Index if needed + if (params.xengsort_idx_path) { + xengsort_index = params.xengsort_idx_path + } else { + XENGSORT_INDEX(params.xengsort_host_fasta, params.ref_fa) + xengsort_index = XENGSORT_INDEX.out.xengsort_index + } + + // Xengsort Classification + XENGSORT_CLASSIFY(xengsort_index, GUNZIP.out.gunzip_fastq) + ch_XENGSORT_CLASSIFY_multiqc = XENGSORT_CLASSIFY.out.xengsort_log + + fusion_tool_input = XENGSORT_CLASSIFY.out.xengsort_human_fastq } else { fusion_tool_input = GUNZIP.out.gunzip_fastq @@ -155,9 +165,9 @@ workflow RNA_FUSION { // Step 5: MultiQC ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_XENGSORT_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FUSION_REPORT.out.summary_fusions_mq.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = 
ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) MULTIQC ( ch_multiqc_files.collect() diff --git a/workflows/somatic_wes.nf b/workflows/somatic_wes.nf index 8e6270c..626f46b 100755 --- a/workflows/somatic_wes.nf +++ b/workflows/somatic_wes.nf @@ -12,7 +12,8 @@ include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/conca include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" include {FASTP} from "${projectDir}/modules/fastp/fastp" include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" -include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {XENGSORT_INDEX} from "${projectDir}/modules/xengsort/xengsort_index" +include {XENGSORT_CLASSIFY} from "${projectDir}/modules/xengsort/xengsort_classify" // include {GZIP} from "${projectDir}/modules/utility_modules/gzip" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" @@ -161,27 +162,31 @@ workflow SOMATIC_WES { // Step 1: Read Trim FASTP(read_ch) - - xenome_input = FASTP.out.trimmed_fastq FASTQC(FASTP.out.trimmed_fastq) // Step 3: Get Read Group Information READ_GROUPS(FASTP.out.trimmed_fastq, "gatk") - // Step 1a: Xenome if PDX data used. - ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + // Step 1a: Run Xengsort if PDX data used. + ch_XENGSORT_CLASSIFY_multiqc = Channel.empty() //optional log file. 
if (params.pdx){ - // Xenome Classification - XENOME_CLASSIFY(FASTP.out.trimmed_fastq) - ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats //set log file for multiqc - // GZIP(XENOME_CLASSIFY.out.xenome_human_fastq) + // Generate Xengsort Index if needed + if (params.xengsort_idx_path) { + xengsort_index = params.xengsort_idx_path + } else { + XENGSORT_INDEX(params.xengsort_host_fasta, params.ref_fa) + xengsort_index = XENGSORT_INDEX.out.xengsort_index + } + // Xengsort Classification + XENGSORT_CLASSIFY(xengsort_index, FASTP.out.trimmed_fastq) + ch_XENGSORT_CLASSIFY_multiqc = XENGSORT_CLASSIFY.out.xengsort_log + // Step 4: BWA-MEM Alignment - bwa_mem_mapping = XENOME_CLASSIFY.out.xenome_human_fastq.join(READ_GROUPS.out.read_groups) + bwa_mem_mapping = XENGSORT_CLASSIFY.out.xengsort_human_fastq.join(READ_GROUPS.out.read_groups) .map{it -> [it[0], it[1], 'aln', it[2]]} - } else { bwa_mem_mapping = FASTP.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) .map{it -> [it[0], it[1], 'aln', it[2]]} @@ -255,7 +260,6 @@ workflow SOMATIC_WES { GATK_VARIANTFILTRATION_INDEL(var_filter_indel, 'INDEL') // Step 9: Post Variant Calling Processing - Part 1 - // SNPSIFT_ANNOTATE_SNP_DBSNP(GATK_VARIANTFILTRATION_SNP.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') SNPSIFT_ANNOTATE_SNP_COSMIC(SNPSIFT_ANNOTATE_SNP_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') SNPEFF_SNP(SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf, 'SNP', 'vcf') @@ -285,10 +289,10 @@ workflow SOMATIC_WES { ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(FASTP.out.quality_json.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_XENGSORT_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) ch_multiqc_files = 
ch_multiqc_files.mix(PICARD_COLLECTHSMETRICS.out.hsmetrics.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(GATK_FILTERMUECTCALLS.out.stats.collect{it[1]}.ifEmpty([])) MULTIQC ( diff --git a/workflows/somatic_wes_pta.nf b/workflows/somatic_wes_pta.nf index a54ea66..e81c6dc 100755 --- a/workflows/somatic_wes_pta.nf +++ b/workflows/somatic_wes_pta.nf @@ -9,7 +9,8 @@ include {CONCATENATE_PTA_FASTQ} from "${projectDir}/subworkflows/concatenate_pta include {FASTP} from "${projectDir}/modules/fastp/fastp" include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" -include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {XENGSORT_INDEX} from "${projectDir}/modules/xengsort/xengsort_index" +include {XENGSORT_CLASSIFY} from "${projectDir}/modules/xengsort/xengsort_classify" // include {GZIP} from "${projectDir}/modules/utility_modules/gzip" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" @@ -112,8 +113,8 @@ workflow SOMATIC_WES_PTA { // Step 3: Get Read Group Information READ_GROUPS(FASTP.out.trimmed_fastq, "gatk") - // Step 1a: Xenome if PDX data used. - ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + // Step 1a: Xengsort if PDX data used. + ch_XENGSORT_CLASSIFY_multiqc = Channel.empty() //optional log file. 
if (params.pdx){ FASTP.out.trimmed_fastq.join(meta_ch).branch{ normal: it[2].status == 0 @@ -122,16 +123,21 @@ workflow SOMATIC_WES_PTA { normal_fastqs = fastq_files.normal.map{it -> [it[0], it[1]] } - // Xenome Classification - XENOME_CLASSIFY(fastq_files.tumor.map{it -> [it[0], it[1]] }) - ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats //set log file for multiqc + // Generate Xengsort Index if needed + if (params.xengsort_idx_path) { + xengsort_index = params.xengsort_idx_path + } else { + XENGSORT_INDEX(params.xengsort_host_fasta, params.ref_fa) + xengsort_index = XENGSORT_INDEX.out.xengsort_index + } - // GZIP(XENOME_CLASSIFY.out.xenome_human_fastq) + // Xengsort Classification + XENGSORT_CLASSIFY(xengsort_index, fastq_files.tumor.map{it -> [it[0], it[1]] }) + ch_XENGSORT_CLASSIFY_multiqc = XENGSORT_CLASSIFY.out.xengsort_log // Step 4: BWA-MEM Alignment - bwa_mem_mapping = XENOME_CLASSIFY.out.xenome_human_fastq.mix(normal_fastqs).join(READ_GROUPS.out.read_groups) + bwa_mem_mapping = XENGSORT_CLASSIFY.out.xengsort_human_fastq.mix(normal_fastqs).join(READ_GROUPS.out.read_groups) .map{it -> [it[0], it[1], 'aln', it[2]]} - } else { bwa_mem_mapping = FASTP.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) .map{it -> [it[0], it[1], 'aln', it[2]]} @@ -314,10 +320,10 @@ workflow SOMATIC_WES_PTA { ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(FASTP.out.quality_json.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_XENGSORT_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTHSMETRICS.out.hsmetrics.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = 
ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(GATK_FILTERMUECTCALLS.out.stats.collect{it[1]}.ifEmpty([])) MULTIQC (