diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 9f3bf8022..5638fdc8e 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -32,6 +32,8 @@ Added - Add filtering ``Variant`` by ``VariantExperiment`` and ``VariantCall`` - Use generic permission filters when filtering variants by related models - Return only distinct ``VariantExperiment`` objects +- Add ``--bam-output`` input argument to ``vc-gatk4-hc`` +- Add ``--max-mnp-distance`` input argument to ``vc-gatk4-hc`` =================== diff --git a/resolwe_bio/processes/variant_calling/gatk4_hc.py b/resolwe_bio/processes/variant_calling/gatk4_hc.py index aec948f4f..0bb905f21 100644 --- a/resolwe_bio/processes/variant_calling/gatk4_hc.py +++ b/resolwe_bio/processes/variant_calling/gatk4_hc.py @@ -36,7 +36,7 @@ class GatkHaplotypeCaller(Process): name = "GATK4 (HaplotypeCaller)" category = "GATK" process_type = "data:variants:vcf:gatk:hc" - version = "1.5.0" + version = "1.6.0" scheduling_class = SchedulingClass.BATCH entity = {"type": "sample"} requirements = { @@ -113,6 +113,33 @@ class Advanced: default=12, description="Set the maximum Java heap size (in GB).", ) + bam_out = BooleanField( + label="Output a BAM containing assembled haplotypes.", + default=False, + description="Output a BAM containing assembled haplotypes. " + "Really for debugging purposes only. " + "Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. " + "Turning on this mode may result in serious performance cost for the caller.", + ) + bam_writer_type = StringField( + label="BAM writer type", + default="CALLED_HAPLOTYPES", + description="The type of BAM output. " + "This determines whether HC will write out all of the haplotypes it considered (top 128 max) " + "or just the ones that were selected as alleles and assigned to samples.", + choices=[ + ("CALLED_HAPLOTYPES", "Assigned haplotypes"), + ("ALL_POSSIBLE_HAPLOTYPES", "All considered haplotypes"), + ("NO_HAPLOTYPES", "Do not output haplotypes"), + ], + disabled="!bam_out", + ) + max_mnp_distance = IntegerField( + label="Max MNP distance", + default=0, + description="Two or more phased substitutions separated by this distance or less are merged into MNPs. " + "Set 0 to disable.", + ) advanced = GroupField(Advanced, label="Advanced options") @@ -121,6 +148,8 @@ class Output: vcf = FileField(label="VCF file") tbi = FileField(label="Tabix index") + bam = FileField(label="Alignment file", required=False) + bai = FileField(label="BAM file index", required=False) species = StringField(label="Species") build = StringField(label="Build") @@ -157,6 +186,8 @@ def run(self, inputs, outputs): inputs.stand_call_conf, "--tmp-dir", TMPDIR, + "--max-mnp-distance", + inputs.advanced.max_mnp_distance, ] if inputs.advanced.soft_clipped: @@ -168,6 +199,11 @@ def run(self, inputs, outputs): if inputs.advanced.interval_padding: args.extend(["--interval-padding", inputs.advanced.interval_padding]) + if inputs.advanced.bam_out: + output_bam = name + ".gatkHC.bam" + args.extend(["--bam-output", output_bam]) + args.extend(["--bam-writer-type", inputs.advanced.bam_writer_type]) + return_code, stdout, stderr = Cmd["gatk"]["HaplotypeCaller"][args] & TEE( retcode=None ) @@ -188,3 +224,9 @@ def run(self, inputs, outputs): outputs.tbi = variants_index outputs.species = inputs.alignment.output.species outputs.build = inputs.alignment.output.build + if inputs.advanced.bam_out: + output_bai = Path(name + ".gatkHC.bai") + renamed_bai = output_bai.with_suffix(".bam.bai") + output_bai.rename(renamed_bai) + outputs.bam = output_bam + outputs.bai = str(renamed_bai) diff --git a/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam b/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam new file mode 100644 index 000000000..302bea274 Binary files /dev/null and b/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam differ diff --git a/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam.bai b/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam.bai new file mode 100644 index 000000000..f72c85431 Binary files /dev/null and b/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam.bai differ diff --git a/resolwe_bio/tests/processes/test_variant_calling.py b/resolwe_bio/tests/processes/test_variant_calling.py index 408bcdf6b..241774a79 100644 --- a/resolwe_bio/tests/processes/test_variant_calling.py +++ b/resolwe_bio/tests/processes/test_variant_calling.py @@ -193,6 +193,8 @@ def test_gatk_haplotypecaller(self): "soft_clipped": True, "java_gc_threads": 3, "max_heap_size": 7, + "bam_out": True, + "bam_writer_type": "CALLED_HAPLOTYPES", }, }, ) @@ -205,6 +207,12 @@ def test_gatk_haplotypecaller(self): file_filter=filter_vcf_variable, compression="gzip", ) + self.assertFile( + gatk_rnaseq, "bam", output_folder / "56GSID_10k.rna-seq.gatkHC.bam" + ) + self.assertFile( + gatk_rnaseq, "bai", output_folder / "56GSID_10k.rna-seq.gatkHC.bam.bai" + ) @tag_process("snpeff", "snpeff-single") def test_snpeff(self):