Skip to content

Commit

Permalink
Add BAM output option to vc-gatk4-hc
Browse files Browse the repository at this point in the history
  • Loading branch information
marcellevstek committed Dec 5, 2024
1 parent 4297a2d commit ac9e29c
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Unreleased
Added
-----
- Add filtering ``Variant`` by ``VariantExperiment`` and ``VariantCall``
- Add BAM output option to ``vc-gatk4-hc``

Changed
-------
Expand Down
31 changes: 30 additions & 1 deletion resolwe_bio/processes/variant_calling/gatk4_hc.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class GatkHaplotypeCaller(Process):
name = "GATK4 (HaplotypeCaller)"
category = "GATK"
process_type = "data:variants:vcf:gatk:hc"
version = "1.5.0"
version = "1.6.0"
scheduling_class = SchedulingClass.BATCH
entity = {"type": "sample"}
requirements = {
Expand Down Expand Up @@ -113,6 +113,27 @@ class Advanced:
default=12,
description="Set the maximum Java heap size (in GB).",
)
bam_out = BooleanField(
label="Output a BAM containing assembled haplotypes.",
default=False,
description="Output a BAM containing assembled haplotypes. "
"Really for debugging purposes only. "
"Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. "
"Turning on this mode may result in serious performance cost for the caller.",
)
bam_writer_type = StringField(
label="BAM writer type",
default="CALLED_HAPLOTYPES",
description="The type of BAM output. "
"This determines whether HC will write out all of the haplotypes it considered (top 128 max) "
"or just the ones that were selected as alleles and assigned to samples.",
choices=[
("CALLED_HAPLOTYPES", "Assigned haplotypes"),
("ALL_POSSIBLE_HAPLOTYPES", "All considered haplotypes"),
("NO_HAPLOTYPES", "Do not output haplotypes"),
],
disabled="!bam_out",
)

advanced = GroupField(Advanced, label="Advanced options")

Expand All @@ -121,6 +142,7 @@ class Output:

vcf = FileField(label="VCF file")
tbi = FileField(label="Tabix index")
bam = FileField(label="BAM file", required=False)
species = StringField(label="Species")
build = StringField(label="Build")

Expand Down Expand Up @@ -168,6 +190,11 @@ def run(self, inputs, outputs):
if inputs.advanced.interval_padding:
args.extend(["--interval-padding", inputs.advanced.interval_padding])

if inputs.advanced.bam_out:
output_bam = name + ".gatkHC.bam"
args.extend(["--bam-output", output_bam])
args.extend(["--bam-writer-type", inputs.advanced.bam_writer_type])

return_code, stdout, stderr = Cmd["gatk"]["HaplotypeCaller"][args] & TEE(
retcode=None
)
Expand All @@ -188,3 +215,5 @@ def run(self, inputs, outputs):
outputs.tbi = variants_index
outputs.species = inputs.alignment.output.species
outputs.build = inputs.alignment.output.build
if inputs.advanced.bam_out:
outputs.bam = output_bam
Binary file not shown.
5 changes: 5 additions & 0 deletions resolwe_bio/tests/processes/test_variant_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ def test_gatk_haplotypecaller(self):
"soft_clipped": True,
"java_gc_threads": 3,
"max_heap_size": 7,
"bam_out": True,
"bam_writer_type": "CALLED_HAPLOTYPES",
},
},
)
Expand All @@ -205,6 +207,9 @@ def test_gatk_haplotypecaller(self):
file_filter=filter_vcf_variable,
compression="gzip",
)
self.assertFile(
gatk_rnaseq, "bam", output_folder / "56GSID_10k.rna-seq.gatkHC.bam"
)

@tag_process("snpeff", "snpeff-single")
def test_snpeff(self):
Expand Down

0 comments on commit ac9e29c

Please sign in to comment.